Bump version to v0.5.27 for release

fix: telegram archiver was outdated for images
Bump version to v0.5.26 for release
2026-06-12 21:28:29 +03:00 · 2023-07-11 12:16:06 +01:00 · 2023-07-11 12:15:56 +01:00 · 2023-07-02 18:42:59 +02:00 · 2023-07-02 18:42:43 +02:00 · 2023-06-26 18:18:46 +01:00
7 changed files with 38 additions and 16 deletions
--- a/Pipfile
+++ b/Pipfile
@@ -30,7 +30,7 @@ tqdm = "*"
 jinja2 = "*"
 cryptography = "*"
 dataclasses-json = "*"
-yt-dlp = ">=2023.2.17"
+yt-dlp = "*"
 vk-url-scraper = "*"
 uwsgi = "*"
 requests = {extras = ["socks"], version = "*"}
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "6e76638769e56f28c2cc56e548d3ac1752b36db2160e23a865089c80e584dcba"
+            "sha256": "84ebe4378c02b26d0663f6d7ede49064ec7428dddca668c8d8a5d64cf9191f09"
        },
        "pipfile-spec": 6,
        "requires": {
--- a/README.md
+++ b/README.md
@@ -197,7 +197,8 @@ Outputs:
 * **Title**: Post title
 * **Text**: Post text
 * **Screenshot**: Link to screenshot of post
-* **Hash**: Hash of archived HTML file (which contains hashes of post media)
+* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
 * **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
 * **WACZ**: Link to a WACZ web archive of post
 * **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
--- a/src/auto_archiver/archivers/telegram_archiver.py
+++ b/src/auto_archiver/archivers/telegram_archiver.py
@@ -48,7 +48,8 @@ class TelegramArchiver(Archiver):
        video = s.find("video")
        if video is None:
            logger.warning("could not find video")
-            image_tags = s.find_all(class_="js-message_photo")
+            image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
            logger.info(image_tags)
            image_urls = []
            for im in image_tags:
--- a/src/auto_archiver/archivers/twitter_archiver.py
+++ b/src/auto_archiver/archivers/twitter_archiver.py
@@ -90,20 +90,36 @@ class TwitterArchiver(Archiver):
    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
        """
-        CURRENTLY STOPPED WORKING
+        Hack alternative working again.
        https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
        https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
        next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
        """
-        return False
+        headers = {
-        # https://stackoverflow.com/a/71867055/6196010
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
            "Accept": "*/*",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Origin": "https://platform.twitter.com",
            "Connection": "keep-alive",
            "Referer": "https://platform.twitter.com/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "cross-site",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "TE": "trailers"
        }
        logger.debug(f"Trying twitter hack for {url=}")
        result = Metadata()
-        hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}"
+        hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
        r = requests.get(hack_url)
        if r.status_code != 200: return False
        tweet = r.json()
        urls = []
-        for p in tweet["photos"]:
+        for p in tweet.get("photos", []):
            urls.append(p["url"])
        # 1 tweet has 1 video max
@@ -113,14 +129,18 @@ class TwitterArchiver(Archiver):
        logger.debug(f"Twitter hack got {urls=}")
-        for u in urls:
+        for i, u in enumerate(urls):
-            media = Media()
+            media = Media(filename="")
            media.set("src", u)
-            media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item)
+            ext = ""
            if (mtype := mimetypes.guess_type(u)[0]):
                ext = mimetypes.guess_extension(mtype)
            media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
            result.add_media(media)
-        result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
+        result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
-        return result
+        return result.success("twitter-hack")
    def get_username_tweet_id(self, url):
        # detect URLs that we definitely cannot handle
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -67,7 +67,7 @@ class GsheetsDb(Database):
        batch_if_valid('title', item.get_title())
        batch_if_valid('text', item.get("content", ""))
        batch_if_valid('timestamp', item.get_timestamp())
-        batch_if_valid('hash', media.get("hash", "not-calculated"))
+        if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
        # merge all pdq hashes into a single string, if present
        pdq_hashes = []
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "5"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "25"
+_PATCH = "27"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
Author	SHA1	Message	Date
msramalho	4d80ee6f02	Bump version to v0.5.27 for release	2023-07-11 12:16:06 +01:00
msramalho	92569ae6be	fix: telegram archiver was outdated for images	2023-07-11 12:15:56 +01:00
msramalho	abaf86c776	Bump version to v0.5.26 for release	2023-07-02 18:42:59 +02:00
msramalho	8005a1955a	fixes #82 twitter api walls	2023-07-02 18:42:43 +02:00
msramalho	b7889a182d	readme update	2023-06-26 18:18:46 +01:00