refactors free twitter archiver strategies (#142)

2026-06-08 03:18:28 +03:00 · 2024-05-14 16:23:33 +01:00
parent 012cc36609
commit f8824691dd
2 changed files with 100 additions and 16 deletions
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -1,4 +1,5 @@
 import re, requests, mimetypes, json
+from typing import Union
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
@@ -31,7 +32,7 @@ class TwitterArchiver(Archiver):
        # expand URL if t.co and clean tracker GET params
        if 'https://t.co/' in url:
            try:
-                r = requests.get(url)
+                r = requests.get(url, timeout=30)
                logger.debug(f'Expanded url {url} to {r.url}')
                url = r.url
            except:
@@ -45,19 +46,31 @@ class TwitterArchiver(Archiver):
        can handle private/public channels
        """
        url = item.get_url()
-        # detect URLs that we definitely cannot handle
        username, tweet_id = self.get_username_tweet_id(url)
        if not username: return False

-        result = Metadata()
+        strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
+        for strategy in strategies:
+            logger.debug(f"Trying {strategy.__name__} for {url=}")
+            try:
+                result = strategy(item, url, tweet_id)
+                if result: return result
+            except Exception as ex:
+                logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
+        
+        logger.warning(f"No free strategy worked for {url}")
+        return False

+        
+    def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception as ex:
-            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
-            return self.download_alternative(item, url, tweet_id)
-
+            logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            return False
+        
+        result = Metadata()
        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
        if tweet.media is None:
            logger.debug(f'No media found, archiving tweet text only')
@@ -87,7 +100,7 @@ class TwitterArchiver(Archiver):

        return result.success("twitter-snscrape")

-    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
+    def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        """
        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -95,14 +108,13 @@ class TwitterArchiver(Archiver):
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """

-        logger.debug(f"Trying twitter hack for {url=}")
-        result = Metadata()
-
        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
        if r.status_code != 200 or r.json()=={}: 
-            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
-            return self.download_ytdl(item, url, tweet_id)
+            logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
+            return False
+        
+        result = Metadata()
        tweet = r.json()

        urls = []
@@ -128,9 +140,9 @@ class TwitterArchiver(Archiver):
            result.add_media(media)

        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result.success("twitter-hack")
-    
-    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
+        return result.success("twitter-syndication")
+
+    def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
        downloader = YoutubeDL()
        tie = TwitterIE(downloader)
        tweet = tie._extract_status(tweet_id)
@@ -141,6 +153,7 @@ class TwitterArchiver(Archiver):
            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
        if not tweet.get("entities", {}).get("media"):
            logger.debug('No media found, archiving tweet text only')
+            result.status = "twitter-ytdl"
            return result
        for i, tw_media in enumerate(tweet["entities"]["media"]):
            media = Media(filename="")
@@ -160,7 +173,6 @@ class TwitterArchiver(Archiver):
            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
            result.add_media(media)
        return result.success("twitter-ytdl")
-        

    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
--- a/todo.md
+++ b/todo.md
@@ -0,0 +1,72 @@
+------ AA + API
+
+
+
+2024-03-05 11:57:12.910 | ERROR    | auto_archiver.core.orchestrator:archive:116 - ERROR enricher wacz_archiver_enricher: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host': Traceback (most recent call last):
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 114, in archive
+    try: e.enrich(result)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/enrichers/wacz_enricher.py", line 70, in enrich
+    browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
+AttributeError: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host'
+
+
+
+-------- API
+
+
+2024-02-29 17:12:06.078 | WARNING  | worker:task_failure_notifier:100 - 😅 From task_failure_notifier ==> Task failed successfully! 
+2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:101 - list index out of range
+2024-02-29 17:12:06.078 | ERROR    | worker:task_failure_notifier:102 - <traceback object at 0x7f3db75446c0>
+2024-02-29 17:12:06.079 | ERROR    | worker:task_failure_notifier:103 -   File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 412, in trace_task
+    R = retval = fun(*args, **kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 704, in __protected_call__
+    return self.run(*args, **kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 50, in run
+    raise task.retry(exc=exc, **retry_kwargs)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/task.py", line 706, in retry
+    raise_with_context(exc)
+
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 35, in run
+    return task._orig_run(*args, **kwargs)
+
+  File "/app/worker.py", line 35, in create_archive_task
+    invalid = is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)
+
+  File "/app/worker.py", line 160, in is_group_invalid_for_user
+    if not crud.is_user_in_group(session, group_id, author_id):
+
+  File "/app/db/crud.py", line 93, in is_user_in_group
+    return len(group_name) and len(email) and group_name in get_user_groups(db, email)
+
+  File "/app/db/crud.py", line 103, in get_user_groups
+    domain_level_groups = DOMAIN_GROUPS.get(email.split('@')[1], [])
+
+------------------ API
+
+[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/b0c88017bb047ff43fc49907/3811d9d0c74541929f4a72d0.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ff30ece740738d060229c5da/e43172422e274c2a8f9529ff.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'post 3308982791113602520'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/23f218c518e2d5a17fe856bd/bad85f53a8e54c26991cdff9.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987757405'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/2ffc8c65d6bfec7ef5402bda/520a10e7a7e14028be1cc1c8.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790970975889'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/da0486669cbb102e6221d94c/65b151ee59114ea5b61cfe96.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979533474'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/d8c4db3247780324b6fa6d4a/df195ff24b104182bf255610.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979331432'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/715432d8038abe50c8c89994/f49554bf621848e5881388ee.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982791122025152'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/f143ce41b1eb329a0c404448/6b3d604f676c4ef583b54ff3.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987824546')  ... displaying 10 of 253 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'timestamp_authority_filesno-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/261cb9cefe3c373ad5f7f305/545253c297124d31bbf817c9.html', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', '_final_media'))]
+
+
+2024-02-25 15:43:13.142 | DEBUG    | worker:convert_if_media:200 - error parsing {'pk': '2045172809601551458', 'id': '2045172809601551458_178884643', 'code': 'Bxh6dWkloxi', 'taken_at': '2019-05-16T16:20:16Z', 'media_type': 1, 'product_type': 'story', 'thumbnail_url': 'https://scontent-sjc3-1.cdninstagram.com/v/t51.12442-15/58604296_289070981969290_2714055836620897014_n.jpg?stp=dst-jpg_e35&efg=eyJ2ZW5jb2RlX3RhZyI6ImltYWdlX3VybGdlbi4xMDI0eDE4MjAuc2RyIn0&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=110&_nc_ohc=ri2YWjVH4dkAX9TFQox&edm=ANmP7GQBAAAA&ccb=7-5&ig_cache_key=MjA0NTE3MjgwOTYwMTU1MTQ1OA%3D%3D.2-ccb7-5&oh=00_AfCIxHr9jkUmeq9NgbmTpWtURV_eu5JGMRbrsc0WwyO59g&oe=65DD0180&_nc_sid=982cc7', 'user': {'pk': '178884643'}, 'locations': [{'location': {'pk': 218723854, 'name': 'Montañita, Ecuador', 'lng': -80.751982661304, 'lat': -1.829127033905}}]} : 'filename'
+2024-02-25 15:43:13.224 | WARNING  | worker:create_sheet_task:84 - cached result detected: (sqlite3.IntegrityError) UNIQUE constraint failed: archive_urls.url, archive_urls.archive_id
+[SQL: INSERT INTO archive_urls (url, archive_id, "key") VALUES (?, ?, ?)]
+[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/3caafdcb057000f8c70610fe/e45ec0b09b854333afa22c1c.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/58f0b188e55ef64b96d04b35/94cd9b3d2d5c4deebb9ccd13.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'story 3310558716812786504_178884643'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/9225487f1a559da4060dcf8e/fe55f26616114856a35cc357.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'post 3301802228841572226'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/e26b8bc09aa4febc53d4a42d/50330faff71d4dc28078d5e6.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228850013597'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/bec948c1bf5b87f1f0923e1d/e4c3ddef88ad487ab0305402.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228917101263'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/38c66be1f1898ea7e0d20d9d/9143edba90184ea387fc856d.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228916961531'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/89f16b246931dc98792d570c/859ea90fd5a84d3cbf1a7b46.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735390140959'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/7ee6f00d8a05e22996307c20/5f5faa30541d406e9f0ea1f2.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735750883968')  ... displaying 10 of 587 total bound parameter sets ...  
('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'timestamp_authority_filesno-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/561ea3dc87c229eb239266f2/8c292219a03445e396001026.html', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', '_final_media'))]
+
+
+
+2024-02-27 13:03:04.585 | ERROR    | auto_archiver.core.orchestrator:archive:128 - ERROR database gsheet_db: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}: Traceback (most recent call last):
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 126, in archive
+    try: d.done(result)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/databases/gsheet_db.py", line 94, in done
+    gw.batch_set_cell(cell_updates)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/utils/gworksheet.py", line 104, in batch_set_cell
+    self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/worksheet.py", line 1361, in batch_update
+    response = self.client.values_batch_update(self.spreadsheet_id, body=body)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 263, in values_batch_update
+    r = self.request("post", url, json=body)
+  File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 123, in request
+    raise APIError(response)
+gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}