Compare commits

...

12 Commits

Author SHA1 Message Date
msramalho
b166d57e61 v0.12.0 bump 2024-08-21 13:34:34 +01:00
msramalho
11c3288267 closes #146 2024-08-21 13:33:58 +01:00
msramalho
004143a58a version bump v0.11.6 2024-07-18 11:27:39 +01:00
msramalho
686f0027c4 adds new entries to example orchestration file 2024-07-18 11:27:15 +01:00
dependabot[bot]
b03cf32c73 Bump authlib from 1.3.0 to 1.3.1 (#144)
Bumps [authlib](https://github.com/lepture/authlib) from 1.3.0 to 1.3.1.
- [Release notes](https://github.com/lepture/authlib/releases)
- [Changelog](https://github.com/lepture/authlib/blob/master/docs/changelog.rst)
- [Commits](https://github.com/lepture/authlib/compare/v1.3.0...v1.3.1)

---
updated-dependencies:
- dependency-name: authlib
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-07-18 11:26:22 +01:00
msramalho
dc9e64397e bumping yt-dlp 2024-07-18 11:23:09 +01:00
msramalho
c7bc5e2988 cleanup 2024-05-15 11:04:29 +01:00
msramalho
1e375bd740 version bump 2024-05-14 16:42:15 +01:00
Miguel Sozinho Ramalho
f8824691dd refactors free twitter archiver strategies (#142) 2024-05-14 16:23:33 +01:00
msramalho
012cc36609 removes deprecated datetime method 2024-05-14 15:54:50 +01:00
Miguel Sozinho Ramalho
7cfe1e39cc #135 fix cleanup of telethon session files (#139)
* closes #135

* version bump
2024-04-16 12:45:45 +01:00
Jett Chen
cf8691bad7 Add yt-dlp based archiving for TwitterArchiver (#138)
* Add ytdlp archiving capability

* Add type annotation

* version bump

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
2024-04-15 19:54:55 +01:00
9 changed files with 138 additions and 81 deletions

107
Pipfile.lock generated
View File

@@ -147,11 +147,12 @@
}, },
"authlib": { "authlib": {
"hashes": [ "hashes": [
"sha256:959ea62a5b7b5123c5059758296122b57cd2585ae2ed1c0622c21b371ffdae06", "sha256:7ae843f03c06c5c0debd63c9db91f9fda64fa62a42a77419fa15fbb7e7a58917",
"sha256:9637e4de1fb498310a56900b3e2043a206b03cb11c05422014b0302cbc814be3" "sha256:d35800b973099bbadc49b42b256ecb80041ad56b7fe1216a362c7943c088f377"
], ],
"index": "pypi",
"markers": "python_version >= '3.8'", "markers": "python_version >= '3.8'",
"version": "==1.3.0" "version": "==1.3.1"
}, },
"beautifulsoup4": { "beautifulsoup4": {
"hashes": [ "hashes": [
@@ -273,7 +274,7 @@
"sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2", "sha256:fd5f17ff8f14003595ab414e45fce13d073e0762394f957182e69035c9f3d7c2",
"sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064" "sha256:fdc3ff3bfccdc6b9cc7c342c03aa2400683f0cb891d46e94b64a197910dc4064"
], ],
"markers": "platform_python_implementation >= 'CPython'", "markers": "implementation_name == 'cpython'",
"version": "==1.1.0" "version": "==1.1.0"
}, },
"bs4": { "bs4": {
@@ -294,11 +295,11 @@
}, },
"certifi": { "certifi": {
"hashes": [ "hashes": [
"sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f", "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b",
"sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1" "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"
], ],
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==2024.2.2" "version": "==2024.7.4"
}, },
"certvalidator": { "certvalidator": {
"hashes": [ "hashes": [
@@ -459,7 +460,7 @@
"sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519", "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519",
"sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561" "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"
], ],
"markers": "python_version >= '3.6'", "markers": "python_full_version >= '3.7.0'",
"version": "==3.3.2" "version": "==3.3.2"
}, },
"click": { "click": {
@@ -479,42 +480,42 @@
}, },
"cryptography": { "cryptography": {
"hashes": [ "hashes": [
"sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee", "sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad",
"sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576", "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583",
"sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d", "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b",
"sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30", "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c",
"sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413", "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1",
"sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb", "sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648",
"sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da", "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949",
"sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4", "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba",
"sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd", "sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c",
"sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc", "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9",
"sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8", "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d",
"sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1", "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c",
"sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc", "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e",
"sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e", "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2",
"sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8", "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d",
"sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940", "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7",
"sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400", "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70",
"sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7", "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2",
"sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16", "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7",
"sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278", "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14",
"sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74", "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe",
"sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec", "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e",
"sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1", "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71",
"sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2", "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961",
"sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c", "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7",
"sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922", "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c",
"sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a", "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28",
"sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6", "sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842",
"sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1", "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902",
"sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e", "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801",
"sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac", "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a",
"sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7" "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e"
], ],
"index": "pypi", "index": "pypi",
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==42.0.5" "version": "==42.0.8"
}, },
"dataclasses-json": { "dataclasses-json": {
"hashes": [ "hashes": [
@@ -1704,11 +1705,11 @@
"socks" "socks"
], ],
"hashes": [ "hashes": [
"sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f", "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760",
"sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1" "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.8'",
"version": "==2.31.0" "version": "==2.32.3"
}, },
"requests-oauthlib": { "requests-oauthlib": {
"hashes": [ "hashes": [
@@ -1905,11 +1906,11 @@
"socks" "socks"
], ],
"hashes": [ "hashes": [
"sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d", "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472",
"sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19" "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", "markers": "python_version >= '3.8'",
"version": "==2.2.1" "version": "==2.2.2"
}, },
"vk-api": { "vk-api": {
"hashes": [ "hashes": [
@@ -2010,7 +2011,7 @@
"sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8", "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8",
"sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7" "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"
], ],
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.8'",
"version": "==12.0" "version": "==12.0"
}, },
"werkzeug": { "werkzeug": {
@@ -2127,12 +2128,12 @@
}, },
"yt-dlp": { "yt-dlp": {
"hashes": [ "hashes": [
"sha256:7ee90572b4d313b582b99c89e4eccf779b57ff54edc331873c6b3fba77faa8b0", "sha256:2a59d9e65ef6dadb1ff318346d04403664c3fa395e098fcd0d7ad626ef9f8a89",
"sha256:d6ff6798bd114cc48763564fcb2f296464ec1604f731e69b07a8814c89b170a2" "sha256:f4614e1c710fcb387bf152d2162868c565ed3f675647ecaa19dab54e581780eb"
], ],
"index": "pypi", "index": "pypi",
"markers": "python_version >= '3.8'", "markers": "python_version >= '3.8'",
"version": "==2024.4.9" "version": "==2024.7.15.232803.dev0"
} }
}, },
"develop": { "develop": {

View File

@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
<details><summary><code>Python package instructions</code></summary> <details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.8 or higher installed 1. make sure you have python 3.10 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver` 2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help` 3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise 4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
# ... configurations for the other steps here ... # ... configurations for the other steps here ...
``` ```
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml). To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do: All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

View File

@@ -16,8 +16,13 @@ steps:
# - wacz_archiver_enricher # - wacz_archiver_enricher
enrichers: enrichers:
- hash_enricher - hash_enricher
# - meta_enricher
# - metadata_enricher # - metadata_enricher
# - screenshot_enricher # - screenshot_enricher
# - pdq_hash_enricher
# - ssl_enricher
# - timestamping_enricher
# - whisper_enricher
# - thumbnail_enricher # - thumbnail_enricher
# - wayback_archiver_enricher # - wayback_archiver_enricher
# - wacz_archiver_enricher # - wacz_archiver_enricher

View File

@@ -27,7 +27,7 @@ package_dir=
=src =src
packages=find: packages=find:
find_packages=true find_packages=true
python_requires = >=3.8 python_requires = >=3.10
[options.package_data] [options.package_data]
* = *.html * = *.html

View File

@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):
def cleanup(self) -> None: def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.") logger.info(f"CLEANUP {self.name}.")
if os.path.exists(self.session_file): session_file_name = self.session_file + ".session"
os.remove(self.session_file) if os.path.exists(session_file_name):
os.remove(session_file_name)
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()

View File

@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):
def cleanup(self) -> None: def cleanup(self) -> None:
logger.info(f"CLEANUP {self.name}.") logger.info(f"CLEANUP {self.name}.")
if os.path.exists(self.session_file): session_file_name = self.session_file + ".session"
os.remove(self.session_file) if os.path.exists(session_file_name):
os.remove(session_file_name)
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
""" """

View File

@@ -1,7 +1,10 @@
import re, requests, mimetypes, json import re, requests, mimetypes, json
from typing import Union
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from yt_dlp import YoutubeDL
from yt_dlp.extractor.twitter import TwitterIE
from slugify import slugify from slugify import slugify
from . import Archiver from . import Archiver
@@ -29,7 +32,7 @@ class TwitterArchiver(Archiver):
# expand URL if t.co and clean tracker GET params # expand URL if t.co and clean tracker GET params
if 'https://t.co/' in url: if 'https://t.co/' in url:
try: try:
r = requests.get(url) r = requests.get(url, timeout=30)
logger.debug(f'Expanded url {url} to {r.url}') logger.debug(f'Expanded url {url} to {r.url}')
url = r.url url = r.url
except: except:
@@ -43,19 +46,31 @@ class TwitterArchiver(Archiver):
can handle private/public channels can handle private/public channels
""" """
url = item.get_url() url = item.get_url()
# detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url) username, tweet_id = self.get_username_tweet_id(url)
if not username: return False if not username: return False
result = Metadata() strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
for strategy in strategies:
logger.debug(f"Trying {strategy.__name__} for {url=}")
try:
result = strategy(item, url, tweet_id)
if result: return result
except Exception as ex:
logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
logger.warning(f"No free strategy worked for {url}")
return False
def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
scr = TwitterTweetScraper(tweet_id) scr = TwitterTweetScraper(tweet_id)
try: try:
tweet = next(scr.get_items()) tweet = next(scr.get_items())
except Exception as ex: except Exception as ex:
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}") logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return self.download_alternative(item, url, tweet_id) return False
result = Metadata()
result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date) result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
if tweet.media is None: if tweet.media is None:
logger.debug(f'No media found, archiving tweet text only') logger.debug(f'No media found, archiving tweet text only')
@@ -85,7 +100,7 @@ class TwitterArchiver(Archiver):
return result.success("twitter-snscrape") return result.success("twitter-snscrape")
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
""" """
Hack alternative working again. Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL) https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -93,12 +108,13 @@ class TwitterArchiver(Archiver):
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816 next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
""" """
logger.debug(f"Trying twitter hack for {url=}")
result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}" hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url) r = requests.get(hack_url)
if r.status_code != 200: return False if r.status_code != 200 or r.json()=={}:
logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
return False
result = Metadata()
tweet = r.json() tweet = r.json()
urls = [] urls = []
@@ -108,7 +124,7 @@ class TwitterArchiver(Archiver):
# 1 tweet has 1 video max # 1 tweet has 1 video max
if "video" in tweet: if "video" in tweet:
v = tweet["video"] v = tweet["video"]
urls.append(self.choose_variant(v.get("variants", []))) urls.append(self.choose_variant(v.get("variants", []))['url'])
logger.debug(f"Twitter hack got {urls=}") logger.debug(f"Twitter hack got {urls=}")
@@ -124,7 +140,39 @@ class TwitterArchiver(Archiver):
result.add_media(media) result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")) result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-hack") return result.success("twitter-syndication")
def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
downloader = YoutubeDL()
tie = TwitterIE(downloader)
tweet = tie._extract_status(tweet_id)
result = Metadata()
result\
.set_title(tweet.get('full_text', ''))\
.set_content(json.dumps(tweet, ensure_ascii=False))\
.set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
mimetype = ""
if tw_media["type"] == "photo":
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
mimetype = "image/jpeg"
elif tw_media["type"] == "video":
variant = self.choose_variant(tw_media['video_info']['variants'])
media.set("src", variant['url'])
mimetype = variant['content_type']
elif tw_media["type"] == "animated_gif":
variant = tw_media['video_info']['variants'][0]
media.set("src", variant['url'])
mimetype = variant['content_type']
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter-ytdl")
def get_username_tweet_id(self, url): def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle
@@ -140,13 +188,13 @@ class TwitterArchiver(Archiver):
# choosing the highest quality possible # choosing the highest quality possible
variant, width, height = None, 0, 0 variant, width, height = None, 0, 0
for var in variants: for var in variants:
if var.get("type", "") == "video/mp4": if var.get("content_type", "") == "video/mp4":
width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"]) width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
if width_height: if width_height:
w, h = int(width_height[1]), int(width_height[2]) w, h = int(width_height[1]), int(width_height[2])
if w > width or h > height: if w > width or h > height:
width, height = w, h width, height = w, h
variant = var.get("src", variant) variant = var
else: else:
variant = var.get("src") if not variant else variant variant = var if not variant else variant
return variant return variant

View File

@@ -98,11 +98,12 @@ class YoutubeDLArchiver(Archiver):
result.set("comments", [{ result.set("comments", [{
"text": c["text"], "text": c["text"],
"author": c["author"], "author": c["author"],
"timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc) "timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
} for c in info.get("comments", [])]) } for c in info.get("comments", [])])
if (timestamp := info.get("timestamp")): if (timestamp := info.get("timestamp")):
timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat() #TODO: fix deprecated timestamp,
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
result.set_timestamp(timestamp) result.set_timestamp(timestamp)
if (upload_date := info.get("upload_date")): if (upload_date := info.get("upload_date")):
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc) upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "11" _MINOR = "12"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "1" _PATCH = "0"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""