refactors free twitter archiver strategies (#142 )

removes deprecated datetime method
#135 fix cleanup of telethon session files (#139 )
2026-06-12 13:18:28 +03:00 · 2024-05-14 16:23:33 +01:00 · 2024-05-14 15:54:50 +01:00 · 2024-04-16 12:45:45 +01:00
6 changed files with 110 additions and 23 deletions
--- a/src/auto_archiver/archivers/instagram_tbot_archiver.py
+++ b/src/auto_archiver/archivers/instagram_tbot_archiver.py
@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):
    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
+        session_file_name = self.session_file + ".session"
-            os.remove(self.session_file)
+        if os.path.exists(session_file_name):
            os.remove(session_file_name)
    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()
--- a/src/auto_archiver/archivers/telethon_archiver.py
+++ b/src/auto_archiver/archivers/telethon_archiver.py
@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):
    def cleanup(self) -> None:
        logger.info(f"CLEANUP {self.name}.")
-        if os.path.exists(self.session_file):
+        session_file_name = self.session_file + ".session"
-            os.remove(self.session_file)
+        if os.path.exists(session_file_name):
            os.remove(session_file_name)
    def download(self, item: Metadata) -> Metadata:
        """
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,4 +1,5 @@
 import re, requests, mimetypes, json
 from typing import Union
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
@@ -31,7 +32,7 @@ class TwitterArchiver(Archiver):
        # expand URL if t.co and clean tracker GET params
        if 'https://t.co/' in url:
            try:
-                r = requests.get(url)
+                r = requests.get(url, timeout=30)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except:
@@ -45,19 +46,31 @@ class TwitterArchiver(Archiver):
        can handle private/public channels
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False
-        result = Metadata()
+        strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
        for strategy in strategies:
            logger.debug(f"Trying {strategy.__name__} for {url=}")
            try:
                result = strategy(item, url, tweet_id)
                if result: return result
            except Exception as ex:
                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
        logger.warning(f"No free strategy worked for {url}")
        return False
    def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception as ex:
-            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
-            return self.download_alternative(item, url, tweet_id)
+            return False
        result = Metadata()
        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
        if tweet.media is None:
            logger.debug(f'No media found, archiving tweet text only')
@@ -87,7 +100,7 @@ class TwitterArchiver(Archiver):
        return result.success("twitter-snscrape")
-    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
+    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        """
        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -95,14 +108,13 @@ class TwitterArchiver(Archiver):
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """
        logger.debug(f"Trying twitter hack for {url=}")
        result = Metadata()
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
        if r.status_code != 200 or r.json()=={}: 
-            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
+            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
-            return self.download_ytdl(item, url, tweet_id)
+            return False
        result = Metadata()
        tweet = r.json()
        urls = []
@@ -128,9 +140,9 @@ class TwitterArchiver(Archiver):
            result.add_media(media)
        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-hack")
+        return result.success("twitter-syndication")
-    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
+    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        downloader = YoutubeDL()
        tie = TwitterIE(downloader)
        tweet = tie._extract_status(tweet_id)
@@ -141,6 +153,7 @@ class TwitterArchiver(Archiver):
            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
            result.status = "twitter-ytdl"
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
@@ -161,7 +174,6 @@ class TwitterArchiver(Archiver):
            result.add_media(media)
        return result.success("twitter-ytdl")
    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
        matches = self.link_pattern.findall(url)
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -98,11 +98,12 @@ class YoutubeDLArchiver(Archiver):
            result.set("comments", [{
                "text": c["text"],
                "author": c["author"], 
-                "timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
+                "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
            } for c in info.get("comments", [])])
        if (timestamp := info.get("timestamp")):
-            timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
+            #TODO: fix deprecated timestamp, 
            timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
            result.set_timestamp(timestamp)
        if (upload_date := info.get("upload_date")):
            upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "2"
+_PATCH = "3"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
--- a/todo.md
+++ b/todo.md
@@ -0,0 +1,72 @@
 ------ AA + API
 2024-03-05 11:57:12.910 | ERROR    | auto_archiver.core.orchestrator:archive:116 - ERROR enricher wacz_archiver_enricher: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host': Traceback (most recent call last):
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 114, in archive
    try: e.enrich(result)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/enrichers/wacz_enricher.py", line 70, in enrich
    browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
 AttributeError: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host'
 -------- API
 2024-02-29 17:12:06.078 | WARNING  | worker:task_failure_notifier:100 - 😅 From task_failure_notifier ==> Task failed successfully! 
 2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:101 - list index out of range
 2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:102 - <traceback object at 0x7f3db75446c0>
 2024-02-29 17:12:06.079 | ERROR    | worker:task_failure_notifier:103 -   File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 412, in trace_task
    R = retval = fun(*args, **kwargs)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 704, in __protected_call__
    return self.run(*args, **kwargs)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 50, in run
    raise task.retry(exc=exc, **retry_kwargs)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/task.py", line 706, in retry
    raise_with_context(exc)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 35, in run
    return task._orig_run(*args, **kwargs)
  File "/app/worker.py", line 35, in create_archive_task
    invalid = is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)
  File "/app/worker.py", line 160, in is_group_invalid_for_user
    if not crud.is_user_in_group(session, group_id, author_id):
  File "/app/db/crud.py", line 93, in is_user_in_group
    return len(group_name) and len(email) and group_name in get_user_groups(db, email)
  File "/app/db/crud.py", line 103, in get_user_groups
    domain_level_groups = DOMAIN_GROUPS.get(email.split('@')[1], [])
 ------------------ API
 [parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/b0c88017bb047ff43fc49907/3811d9d0c74541929f4a72d0.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ff30ece740738d060229c5da/e43172422e274c2a8f9529ff.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'post 3308982791113602520'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/23f218c518e2d5a17fe856bd/bad85f53a8e54c26991cdff9.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987757405'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/2ffc8c65d6bfec7ef5402bda/520a10e7a7e14028be1cc1c8.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790970975889'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/da0486669cbb102e6221d94c/65b151ee59114ea5b61cfe96.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979533474'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/d8c4db3247780324b6fa6d4a/df195ff24b104182bf255610.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979331432'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/715432d8038abe50c8c89994/f49554bf621848e5881388ee.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982791122025152'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/f143ce41b1eb329a0c404448/6b3d604f676c4ef583b54ff3.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987824546')  ... displaying 10 of 253 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'timestamp_authority_filesno-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/261cb9cefe3c373ad5f7f305/545253c297124d31bbf817c9.html', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', '_final_media'))]
 2024-02-25 15:43:13.142 | DEBUG    | worker:convert_if_media:200 - error parsing {'pk': '2045172809601551458', 'id': '2045172809601551458_178884643', 'code': 'Bxh6dWkloxi', 'taken_at': '2019-05-16T16:20:16Z', 'media_type': 1, 'product_type': 'story', 'thumbnail_url': 'https://scontent-sjc3-1.cdninstagram.com/v/t51.12442-15/58604296_289070981969290_2714055836620897014_n.jpg?stp=dst-jpg_e35&efg=eyJ2ZW5jb2RlX3RhZyI6ImltYWdlX3VybGdlbi4xMDI0eDE4MjAuc2RyIn0&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=110&_nc_ohc=ri2YWjVH4dkAX9TFQox&edm=ANmP7GQBAAAA&ccb=7-5&ig_cache_key=MjA0NTE3MjgwOTYwMTU1MTQ1OA%3D%3D.2-ccb7-5&oh=00_AfCIxHr9jkUmeq9NgbmTpWtURV_eu5JGMRbrsc0WwyO59g&oe=65DD0180&_nc_sid=982cc7', 'user': {'pk': '178884643'}, 'locations': [{'location': {'pk': 218723854, 'name': 'Montañita, Ecuador', 'lng': -80.751982661304, 'lat': -1.829127033905}}]} : 'filename'
 2024-02-25 15:43:13.224 | WARNING  | worker:create_sheet_task:84 - cached result detected: (sqlite3.IntegrityError) UNIQUE constraint failed: archive_urls.url, archive_urls.archive_id
 [SQL: INSERT INTO archive_urls (url, archive_id, "key") VALUES (?, ?, ?)]
 [parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/3caafdcb057000f8c70610fe/e45ec0b09b854333afa22c1c.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/58f0b188e55ef64b96d04b35/94cd9b3d2d5c4deebb9ccd13.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'story 3310558716812786504_178884643'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/9225487f1a559da4060dcf8e/fe55f26616114856a35cc357.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'post 3301802228841572226'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/e26b8bc09aa4febc53d4a42d/50330faff71d4dc28078d5e6.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228850013597'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/bec948c1bf5b87f1f0923e1d/e4c3ddef88ad487ab0305402.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228917101263'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/38c66be1f1898ea7e0d20d9d/9143edba90184ea387fc856d.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228916961531'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/89f16b246931dc98792d570c/859ea90fd5a84d3cbf1a7b46.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735390140959'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/7ee6f00d8a05e22996307c20/5f5faa30541d406e9f0ea1f2.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735750883968')  ... displaying 10 of 587 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'timestamp_authority_filesno-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/561ea3dc87c229eb239266f2/8c292219a03445e396001026.html', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', '_final_media'))]
 2024-02-27 13:03:04.585 | ERROR    | auto_archiver.core.orchestrator:archive:128 - ERROR database gsheet_db: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}: Traceback (most recent call last):
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 126, in archive
    try: d.done(result)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/databases/gsheet_db.py", line 94, in done
    gw.batch_set_cell(cell_updates)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/utils/gworksheet.py", line 104, in batch_set_cell
    self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/worksheet.py", line 1361, in batch_update
    response = self.client.values_batch_update(self.spreadsheet_id, body=body)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 263, in values_batch_update
    r = self.request("post", url, json=body)
  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 123, in request
    raise APIError(response)
 gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}
Author	SHA1	Message	Date
Miguel Sozinho Ramalho	f8824691dd	refactors free twitter archiver strategies (#142 )	2024-05-14 16:23:33 +01:00
msramalho	012cc36609	removes deprecated datetime method	2024-05-14 15:54:50 +01:00
Miguel Sozinho Ramalho	7cfe1e39cc	#135 fix cleanup of telethon session files (#139 ) * closes #135 * version bump	2024-04-16 12:45:45 +01:00