Compare commits

...

10 Commits

Author SHA1 Message Date
msramalho
65e3c99483 Bump version to v0.5.28 for release 2023-07-26 16:13:14 +01:00
msramalho
888ad8f004 fix: twitter hack videos extension detection 2023-07-26 16:12:56 +01:00
msramalho
086a9e6c84 fix: remove unnecessary log 2023-07-11 12:17:15 +01:00
msramalho
4d80ee6f02 Bump version to v0.5.27 for release 2023-07-11 12:16:06 +01:00
msramalho
92569ae6be fix: telegram archiver was outdated for images 2023-07-11 12:15:56 +01:00
msramalho
abaf86c776 Bump version to v0.5.26 for release 2023-07-02 18:42:59 +02:00
msramalho
8005a1955a fixes #82 twitter api walls 2023-07-02 18:42:43 +02:00
msramalho
b7889a182d readme update 2023-06-26 18:18:46 +01:00
msramalho
04f827f183 Bump version to v0.5.25 for release 2023-06-26 18:15:45 +01:00
msramalho
485901da3c security update 2023-06-26 18:15:19 +01:00
9 changed files with 69 additions and 46 deletions

View File

@@ -28,9 +28,9 @@ python-twitter-v2 = "*"
instaloader = "*" instaloader = "*"
tqdm = "*" tqdm = "*"
jinja2 = "*" jinja2 = "*"
cryptography = "==38.0.4" cryptography = "*"
dataclasses-json = "*" dataclasses-json = "*"
yt-dlp = ">=2023.2.17" yt-dlp = "*"
vk-url-scraper = "*" vk-url-scraper = "*"
uwsgi = "*" uwsgi = "*"
requests = {extras = ["socks"], version = "*"} requests = {extras = ["socks"], version = "*"}

49
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "51cddf6af480a820817a32ca7e6a982d669311d9883b5f9c0f1c49b13d42594e" "sha256": "84ebe4378c02b26d0663f6d7ede49064ec7428dddca668c8d8a5d64cf9191f09"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -474,35 +474,28 @@
}, },
"cryptography": { "cryptography": {
"hashes": [ "hashes": [
"sha256:0e70da4bdff7601b0ef48e6348339e490ebfb0cbe638e083c9c41fb49f00c8bd", "sha256:059e348f9a3c1950937e1b5d7ba1f8e968508ab181e75fc32b879452f08356db",
"sha256:10652dd7282de17990b88679cb82f832752c4e8237f0c714be518044269415db", "sha256:1a5472d40c8f8e91ff7a3d8ac6dfa363d8e3138b961529c996f3e2df0c7a411a",
"sha256:175c1a818b87c9ac80bb7377f5520b7f31b3ef2a0004e2420319beadedb67290", "sha256:1a8e6c2de6fbbcc5e14fd27fb24414507cb3333198ea9ab1258d916f00bc3039",
"sha256:1d7e632804a248103b60b16fb145e8df0bc60eed790ece0d12efe8cd3f3e7744", "sha256:1fee5aacc7367487b4e22484d3c7e547992ed726d14864ee33c0176ae43b0d7c",
"sha256:1f13ddda26a04c06eb57119caf27a524ccae20533729f4b1e4a69b54e07035eb", "sha256:5d092fdfedaec4cbbffbf98cddc915ba145313a6fdaab83c6e67f4e6c218e6f3",
"sha256:2ec2a8714dd005949d4019195d72abed84198d877112abb5a27740e217e0ea8d", "sha256:5f0ff6e18d13a3de56f609dd1fd11470918f770c6bd5d00d632076c727d35485",
"sha256:2fa36a7b2cc0998a3a4d5af26ccb6273f3df133d61da2ba13b3286261e7efb70", "sha256:7bfc55a5eae8b86a287747053140ba221afc65eb06207bedf6e019b8934b477c",
"sha256:2fb481682873035600b5502f0015b664abc26466153fab5c6bc92c1ea69d478b", "sha256:7fa01527046ca5facdf973eef2535a27fec4cb651e4daec4d043ef63f6ecd4ca",
"sha256:3178d46f363d4549b9a76264f41c6948752183b3f587666aff0555ac50fd7876", "sha256:8dde71c4169ec5ccc1087bb7521d54251c016f126f922ab2dfe6649170a3b8c5",
"sha256:4367da5705922cf7070462e964f66e4ac24162e22ab0a2e9d31f1b270dd78083", "sha256:8f4ab7021127a9b4323537300a2acfb450124b2def3756f64dc3a3d2160ee4b5",
"sha256:4eb85075437f0b1fd8cd66c688469a0c4119e0ba855e3fef86691971b887caf6", "sha256:948224d76c4b6457349d47c0c98657557f429b4e93057cf5a2f71d603e2fc3a3",
"sha256:50a1494ed0c3f5b4d07650a68cd6ca62efe8b596ce743a5c94403e6f11bf06c1", "sha256:9a6c7a3c87d595608a39980ebaa04d5a37f94024c9f24eb7d10262b92f739ddb",
"sha256:53049f3379ef05182864d13bb9686657659407148f901f3f1eee57a733fb4b00", "sha256:b46e37db3cc267b4dea1f56da7346c9727e1209aa98487179ee8ebed09d21e43",
"sha256:6391e59ebe7c62d9902c24a4d8bcbc79a68e7c4ab65863536127c8a9cd94043b", "sha256:b4ceb5324b998ce2003bc17d519080b4ec8d5b7b70794cbd2836101406a9be31",
"sha256:67461b5ebca2e4c2ab991733f8ab637a7265bb582f07c7c88914b5afb88cb95b", "sha256:cb33ccf15e89f7ed89b235cff9d49e2e62c6c981a6061c9c8bb47ed7951190bc",
"sha256:78e47e28ddc4ace41dd38c42e6feecfdadf9c3be2af389abbfeef1ff06822285", "sha256:d198820aba55660b4d74f7b5fd1f17db3aa5eb3e6893b0a41b75e84e4f9e0e4b",
"sha256:80ca53981ceeb3241998443c4964a387771588c4e4a5d92735a493af868294f9", "sha256:d34579085401d3f49762d2f7d6634d6b6c2ae1242202e860f4d26b046e3a1006",
"sha256:8a4b2bdb68a447fadebfd7d24855758fe2d6fecc7fed0b78d190b1af39a8e3b0", "sha256:eb8163f5e549a22888c18b0d53d6bb62a20510060a22fd5a995ec8a05268df8a",
"sha256:8e45653fb97eb2f20b8c96f9cd2b3a0654d742b47d638cf2897afbd97f80fa6d", "sha256:f73bff05db2a3e5974a6fd248af2566134d8981fd7ab012e5dd4ddb1d9a70699"
"sha256:998cd19189d8a747b226d24c0207fdaa1e6658a1d3f2494541cb9dfbf7dcb6d2",
"sha256:a10498349d4c8eab7357a8f9aa3463791292845b79597ad1b98a543686fb1ec8",
"sha256:b4cad0cea995af760f82820ab4ca54e5471fc782f70a007f31531957f43e9dee",
"sha256:bfe6472507986613dc6cc00b3d492b2f7564b02b3b3682d25ca7f40fa3fd321b",
"sha256:c9e0d79ee4c56d841bd4ac6e7697c8ff3c8d6da67379057f29e66acffcd1e9a7",
"sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353",
"sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c"
], ],
"index": "pypi", "index": "pypi",
"version": "==38.0.4" "version": "==41.0.1"
}, },
"dataclasses-json": { "dataclasses-json": {
"hashes": [ "hashes": [

View File

@@ -197,7 +197,8 @@ Outputs:
* **Title**: Post title * **Title**: Post title
* **Text**: Post text * **Text**: Post text
* **Screenshot**: Link to screenshot of post * **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) * **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post * **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive * **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
@@ -228,7 +229,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
#### Docker development #### Docker development
working with docker locally: working with docker locally:
* `docker build . -t auto-archiver` to build a local image * `docker build . -t auto-archiver` to build a local image
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml` * `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver --config secrets/orchestration.yaml`
* to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive` * to use local archive, also create a volume `-v` for it by adding `-v $PWD/local_archive:/app/local_archive`

View File

@@ -18,7 +18,7 @@ steps:
# - thumbnail_enricher # - thumbnail_enricher
# - wayback_archiver_enricher # - wayback_archiver_enricher
# - wacz_enricher # - wacz_enricher
# - pdq_hash_enricher # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
formatter: html_formatter # defaults to mute_formatter formatter: html_formatter # defaults to mute_formatter
storages: storages:
- local_storage - local_storage

View File

@@ -48,7 +48,7 @@ class TelegramArchiver(Archiver):
video = s.find("video") video = s.find("video")
if video is None: if video is None:
logger.warning("could not find video") logger.warning("could not find video")
image_tags = s.find_all(class_="js-message_photo") image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
image_urls = [] image_urls = []
for im in image_tags: for im in image_tags:

View File

@@ -6,6 +6,7 @@ from slugify import slugify
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media
from ..utils.misc import remove_get_parameters
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
@@ -90,20 +91,36 @@ class TwitterArchiver(Archiver):
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
""" """
CURRENTLY STOPPED WORKING Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
""" """
return False headers = {
# https://stackoverflow.com/a/71867055/6196010 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Origin": "https://platform.twitter.com",
"Connection": "keep-alive",
"Referer": "https://platform.twitter.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"TE": "trailers"
}
logger.debug(f"Trying twitter hack for {url=}") logger.debug(f"Trying twitter hack for {url=}")
result = Metadata() result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url) r = requests.get(hack_url)
if r.status_code != 200: return False if r.status_code != 200: return False
tweet = r.json() tweet = r.json()
urls = [] urls = []
for p in tweet["photos"]: for p in tweet.get("photos", []):
urls.append(p["url"]) urls.append(p["url"])
# 1 tweet has 1 video max # 1 tweet has 1 video max
@@ -113,14 +130,18 @@ class TwitterArchiver(Archiver):
logger.debug(f"Twitter hack got {urls=}") logger.debug(f"Twitter hack got {urls=}")
for u in urls: for i, u in enumerate(urls):
media = Media() media = Media(filename="")
media.set("src", u) media.set("src", u)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item) ext = ""
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
result.add_media(media) result.add_media(media)
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")) result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result return result.success("twitter-hack")
def get_username_tweet_id(self, url): def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle

View File

@@ -67,7 +67,7 @@ class GsheetsDb(Database):
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")) batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())
batch_if_valid('hash', media.get("hash", "not-calculated")) if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present # merge all pdq hashes into a single string, if present
pdq_hashes = [] pdq_hashes = []

View File

@@ -2,6 +2,7 @@
import os, json, requests import os, json, requests
from datetime import datetime from datetime import datetime
from loguru import logger from loguru import logger
from urllib.parse import urlparse, urlunparse
def mkdir_if_not_exists(folder): def mkdir_if_not_exists(folder):
@@ -20,6 +21,13 @@ def expand_url(url):
logger.error(f'Failed to expand url {url}') logger.error(f'Failed to expand url {url}')
return url return url
def remove_get_parameters(url):
    """Return ``url`` with its query string removed.

    Example: ``http://example.com/file.mp4?t=1`` becomes
    ``http://example.com/file.mp4``. Every other URL component
    (scheme, host, path, params, fragment) is kept untouched.
    Useful for mimetypes to work.
    """
    # Split into the six standard components, then reassemble with an
    # empty query slot.
    scheme, netloc, path, params, _query, fragment = urlparse(url)
    return urlunparse((scheme, netloc, path, params, '', fragment))
def getattr_or(o: object, prop: str, default=None): def getattr_or(o: object, prop: str, default=None):
try: try:

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "5" _MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "24" _PATCH = "28"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""