Compare commits

...

11 Commits

Author SHA1 Message Date
msramalho
b166d57e61 v0.12.0 bump 2024-08-21 13:34:34 +01:00
msramalho
11c3288267 closes #146 2024-08-21 13:33:58 +01:00
msramalho
004143a58a version bump v0.11.6 2024-07-18 11:27:39 +01:00
msramalho
686f0027c4 adds new entries to example orchestration file 2024-07-18 11:27:15 +01:00
dependabot[bot]
b03cf32c73 Bump authlib from 1.3.0 to 1.3.1 (#144)
Bumps [authlib](https://github.com/lepture/authlib) from 1.3.0 to 1.3.1.
- [Release notes](https://github.com/lepture/authlib/releases)
- [Changelog](https://github.com/lepture/authlib/blob/master/docs/changelog.rst)
- [Commits](https://github.com/lepture/authlib/compare/v1.3.0...v1.3.1)

---
updated-dependencies:
- dependency-name: authlib
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-07-18 11:26:22 +01:00
msramalho
dc9e64397e bumping yt-dlp 2024-07-18 11:23:09 +01:00
msramalho
c7bc5e2988 cleanup 2024-05-15 11:04:29 +01:00
msramalho
1e375bd740 version bump 2024-05-14 16:42:15 +01:00
Miguel Sozinho Ramalho
f8824691dd refactors free twitter archiver strategies (#142) 2024-05-14 16:23:33 +01:00
msramalho
012cc36609 removes deprecated datetime method 2024-05-14 15:54:50 +01:00
Miguel Sozinho Ramalho
7cfe1e39cc #135 fix cleanup of telethon session files (#139)
* closes #135

* version bump
2024-04-16 12:45:45 +01:00
9 changed files with 101 additions and 80 deletions

107
Pipfile.lock generated
View File

@@ -147,11 +147,12 @@
},
"authlib": {
"hashes": [
"sha256:959ea62a5b7b5123c5059758296122b57cd2585ae2ed1c0622c21b371ffdae06",
"sha256:9637e4de1fb498310a56900b3e2043a206b03cb11c05422014b0302cbc814be3"
"sha256:7ae843f03c06c5c0debd63c9db91f9fda64fa62a42a77419fa15fbb7e7a58917",
"sha256:d35800b973099bbadc49b42b256ecb80041ad56b7fe1216a362c7943c088f377"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
"version": "==1.3.0"
"version": "==1.3.1"
},
"beautifulsoup4": {
"hashes": [
@@ -273,7 +274,7 @@
"sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2",
"sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064"
],
"markers": "platform_python_implementation >= 'CPython'",
"markers": "implementation_name == 'cpython'",
"version": "==1.1.0"
},
"bs4": {
@@ -294,11 +295,11 @@
},
"certifi": {
"hashes": [
"sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f",
"sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"
"sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b",
"sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"
],
"markers": "python_version >= '3.6'",
"version": "==2024.2.2"
"version": "==2024.7.4"
},
"certvalidator": {
"hashes": [
@@ -459,7 +460,7 @@
"sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519",
"sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"
],
"markers": "python_version >= '3.6'",
"markers": "python_full_version >= '3.7.0'",
"version": "==3.3.2"
},
"click": {
@@ -479,42 +480,42 @@
},
"cryptography": {
"hashes": [
"sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee",
"sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576",
"sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d",
"sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30",
"sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413",
"sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb",
"sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da",
"sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4",
"sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd",
"sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc",
"sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8",
"sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1",
"sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc",
"sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e",
"sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8",
"sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940",
"sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400",
"sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7",
"sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16",
"sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278",
"sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74",
"sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec",
"sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1",
"sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2",
"sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c",
"sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922",
"sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a",
"sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6",
"sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1",
"sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e",
"sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac",
"sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"
"sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad",
"sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583",
"sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b",
"sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c",
"sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1",
"sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648",
"sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949",
"sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba",
"sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c",
"sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9",
"sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d",
"sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c",
"sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e",
"sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2",
"sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d",
"sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7",
"sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70",
"sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2",
"sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7",
"sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14",
"sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe",
"sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e",
"sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71",
"sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961",
"sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7",
"sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c",
"sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28",
"sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842",
"sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902",
"sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801",
"sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a",
"sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"
],
"index": "pypi",
"markers": "python_version >= '3.7'",
"version": "==42.0.5"
"version": "==42.0.8"
},
"dataclasses-json": {
"hashes": [
@@ -1704,11 +1705,11 @@
"socks"
],
"hashes": [
"sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f",
"sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"
"sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760",
"sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"
],
"markers": "python_version >= '3.7'",
"version": "==2.31.0"
"markers": "python_version >= '3.8'",
"version": "==2.32.3"
},
"requests-oauthlib": {
"hashes": [
@@ -1905,11 +1906,11 @@
"socks"
],
"hashes": [
"sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d",
"sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"
"sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472",
"sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'",
"version": "==2.2.1"
"markers": "python_version >= '3.8'",
"version": "==2.2.2"
},
"vk-api": {
"hashes": [
@@ -2010,7 +2011,7 @@
"sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8",
"sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"
],
"markers": "python_version >= '3.7'",
"markers": "python_version >= '3.8'",
"version": "==12.0"
},
"werkzeug": {
@@ -2127,12 +2128,12 @@
},
"yt-dlp": {
"hashes": [
"sha256:7ee90572b4d313b582b99c89e4eccf779b57ff54edc331873c6b3fba77faa8b0",
"sha256:d6ff6798bd114cc48763564fcb2f296464ec1604f731e69b07a8814c89b170a2"
"sha256:2a59d9e65ef6dadb1ff318346d04403664c3fa395e098fcd0d7ad626ef9f8a89",
"sha256:f4614e1c710fcb387bf152d2162868c565ed3f675647ecaa19dab54e581780eb"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
"version": "==2024.4.9"
"version": "==2024.7.15.232803.dev0"
}
},
"develop": {

View File

@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.8 or higher installed
1. make sure you have python 3.10 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
# ... configurations for the other steps here ...
```
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

View File

@@ -16,8 +16,13 @@ steps:
# - wacz_archiver_enricher
enrichers:
- hash_enricher
# - meta_enricher
# - metadata_enricher
# - screenshot_enricher
# - pdq_hash_enricher
# - ssl_enricher
# - timestamping_enricher
# - whisper_enricher
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_archiver_enricher

View File

@@ -27,7 +27,7 @@ package_dir=
=src
packages=find:
find_packages=true
python_requires = >=3.8
python_requires = >=3.10
[options.package_data]
* = *.html

View File

@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):
def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.")
if os.path.exists(self.session_file):
os.remove(self.session_file)
session_file_name = self.session_file + ".session"
if os.path.exists(session_file_name):
os.remove(session_file_name)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()

View File

@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):
def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.")
if os.path.exists(self.session_file):
os.remove(self.session_file)
session_file_name = self.session_file + ".session"
if os.path.exists(session_file_name):
os.remove(session_file_name)
def download(self, item: Metadata) -> Metadata:
"""

View File

@@ -1,4 +1,5 @@
import re, requests, mimetypes, json
from typing import Union
from datetime import datetime
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
@@ -31,7 +32,7 @@ class TwitterArchiver(Archiver):
# expand URL if t.co and clean tracker GET params
if 'https://t.co/' in url:
try:
r = requests.get(url)
r = requests.get(url, timeout=30)
logger.debug(f'Expanded url {url} to {r.url}')
url = r.url
except:
@@ -45,19 +46,31 @@ class TwitterArchiver(Archiver):
can handle private/public channels
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url)
if not username: return False
result = Metadata()
strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
for strategy in strategies:
logger.debug(f"Trying {strategy.__name__} for {url=}")
try:
result = strategy(item, url, tweet_id)
if result: return result
except Exception as ex:
logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
logger.warning(f"No free strategy worked for {url}")
return False
def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
scr = TwitterTweetScraper(tweet_id)
try:
tweet = next(scr.get_items())
except Exception as ex:
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return self.download_alternative(item, url, tweet_id)
logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return False
result = Metadata()
result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
if tweet.media is None:
logger.debug(f'No media found, archiving tweet text only')
@@ -87,7 +100,7 @@ class TwitterArchiver(Archiver):
return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
"""
Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -95,14 +108,13 @@ class TwitterArchiver(Archiver):
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
"""
logger.debug(f"Trying twitter hack for {url=}")
result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url)
if r.status_code != 200 or r.json()=={}:
logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
return self.download_ytdl(item, url, tweet_id)
logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
return False
result = Metadata()
tweet = r.json()
urls = []
@@ -128,9 +140,9 @@ class TwitterArchiver(Archiver):
result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-hack")
def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
return result.success("twitter-syndication")
def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
downloader = YoutubeDL()
tie = TwitterIE(downloader)
tweet = tie._extract_status(tweet_id)
@@ -141,6 +153,7 @@ class TwitterArchiver(Archiver):
.set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
@@ -160,7 +173,6 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter-ytdl")
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle

View File

@@ -98,11 +98,12 @@ class YoutubeDLArchiver(Archiver):
result.set("comments", [{
"text": c["text"],
"author": c["author"],
"timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
} for c in info.get("comments", [])])
if (timestamp := info.get("timestamp")):
timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
#TODO: fix deprecated timestamp,
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
result.set_timestamp(timestamp)
if (upload_date := info.get("upload_date")):
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "11"
_MINOR = "12"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "2"
_PATCH = "0"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""