Compare commits

..

10 Commits

Author SHA1 Message Date
msramalho
65e3c99483 Bump version to v0.5.28 for release 2023-07-26 16:13:14 +01:00
msramalho
888ad8f004 fix: twitter hack videos extension detection 2023-07-26 16:12:56 +01:00
msramalho
086a9e6c84 fix: remove unnecessary log 2023-07-11 12:17:15 +01:00
msramalho
4d80ee6f02 Bump version to v0.5.27 for release 2023-07-11 12:16:06 +01:00
msramalho
92569ae6be fix: telegram archiver was outdated for images 2023-07-11 12:15:56 +01:00
msramalho
abaf86c776 Bump version to v0.5.26 for release 2023-07-02 18:42:59 +02:00
msramalho
8005a1955a fixes #82 twitter api walls 2023-07-02 18:42:43 +02:00
msramalho
b7889a182d readme update 2023-06-26 18:18:46 +01:00
msramalho
04f827f183 Bump version to v0.5.25 for release 2023-06-26 18:15:45 +01:00
msramalho
485901da3c security update 2023-06-26 18:15:19 +01:00
9 changed files with 69 additions and 46 deletions

View File

@@ -28,9 +28,9 @@ python-twitter-v2 = "*"
instaloader = "*"
tqdm = "*"
jinja2 = "*"
cryptography = "==38.0.4"
cryptography = "*"
dataclasses-json = "*"
yt-dlp = ">=2023.2.17"
yt-dlp = "*"
vk-url-scraper = "*"
uwsgi = "*"
requests = {extras = ["socks"], version = "*"}

49
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "51cddf6af480a820817a32ca7e6a982d669311d9883b5f9c0f1c49b13d42594e"
"sha256": "84ebe4378c02b26d0663f6d7ede49064ec7428dddca668c8d8a5d64cf9191f09"
},
"pipfile-spec": 6,
"requires": {
@@ -474,35 +474,28 @@
},
"cryptography": {
"hashes": [
"sha256:0e70da4bdff7601b0ef48e6348339e490ebfb0cbe638e083c9c41fb49f00c8bd",
"sha256:10652dd7282de17990b88679cb82f832752c4e8237f0c714be518044269415db",
"sha256:175c1a818b87c9ac80bb7377f5520b7f31b3ef2a0004e2420319beadedb67290",
"sha256:1d7e632804a248103b60b16fb145e8df0bc60eed790ece0d12efe8cd3f3e7744",
"sha256:1f13ddda26a04c06eb57119caf27a524ccae20533729f4b1e4a69b54e07035eb",
"sha256:2ec2a8714dd005949d4019195d72abed84198d877112abb5a27740e217e0ea8d",
"sha256:2fa36a7b2cc0998a3a4d5af26ccb6273f3df133d61da2ba13b3286261e7efb70",
"sha256:2fb481682873035600b5502f0015b664abc26466153fab5c6bc92c1ea69d478b",
"sha256:3178d46f363d4549b9a76264f41c6948752183b3f587666aff0555ac50fd7876",
"sha256:4367da5705922cf7070462e964f66e4ac24162e22ab0a2e9d31f1b270dd78083",
"sha256:4eb85075437f0b1fd8cd66c688469a0c4119e0ba855e3fef86691971b887caf6",
"sha256:50a1494ed0c3f5b4d07650a68cd6ca62efe8b596ce743a5c94403e6f11bf06c1",
"sha256:53049f3379ef05182864d13bb9686657659407148f901f3f1eee57a733fb4b00",
"sha256:6391e59ebe7c62d9902c24a4d8bcbc79a68e7c4ab65863536127c8a9cd94043b",
"sha256:67461b5ebca2e4c2ab991733f8ab637a7265bb582f07c7c88914b5afb88cb95b",
"sha256:78e47e28ddc4ace41dd38c42e6feecfdadf9c3be2af389abbfeef1ff06822285",
"sha256:80ca53981ceeb3241998443c4964a387771588c4e4a5d92735a493af868294f9",
"sha256:8a4b2bdb68a447fadebfd7d24855758fe2d6fecc7fed0b78d190b1af39a8e3b0",
"sha256:8e45653fb97eb2f20b8c96f9cd2b3a0654d742b47d638cf2897afbd97f80fa6d",
"sha256:998cd19189d8a747b226d24c0207fdaa1e6658a1d3f2494541cb9dfbf7dcb6d2",
"sha256:a10498349d4c8eab7357a8f9aa3463791292845b79597ad1b98a543686fb1ec8",
"sha256:b4cad0cea995af760f82820ab4ca54e5471fc782f70a007f31531957f43e9dee",
"sha256:bfe6472507986613dc6cc00b3d492b2f7564b02b3b3682d25ca7f40fa3fd321b",
"sha256:c9e0d79ee4c56d841bd4ac6e7697c8ff3c8d6da67379057f29e66acffcd1e9a7",
"sha256:ca57eb3ddaccd1112c18fc80abe41db443cc2e9dcb1917078e02dfa010a4f353",
"sha256:ce127dd0a6a0811c251a6cddd014d292728484e530d80e872ad9806cfb1c5b3c"
"sha256:059e348f9a3c1950937e1b5d7ba1f8e968508ab181e75fc32b879452f08356db",
"sha256:1a5472d40c8f8e91ff7a3d8ac6dfa363d8e3138b961529c996f3e2df0c7a411a",
"sha256:1a8e6c2de6fbbcc5e14fd27fb24414507cb3333198ea9ab1258d916f00bc3039",
"sha256:1fee5aacc7367487b4e22484d3c7e547992ed726d14864ee33c0176ae43b0d7c",
"sha256:5d092fdfedaec4cbbffbf98cddc915ba145313a6fdaab83c6e67f4e6c218e6f3",
"sha256:5f0ff6e18d13a3de56f609dd1fd11470918f770c6bd5d00d632076c727d35485",
"sha256:7bfc55a5eae8b86a287747053140ba221afc65eb06207bedf6e019b8934b477c",
"sha256:7fa01527046ca5facdf973eef2535a27fec4cb651e4daec4d043ef63f6ecd4ca",
"sha256:8dde71c4169ec5ccc1087bb7521d54251c016f126f922ab2dfe6649170a3b8c5",
"sha256:8f4ab7021127a9b4323537300a2acfb450124b2def3756f64dc3a3d2160ee4b5",
"sha256:948224d76c4b6457349d47c0c98657557f429b4e93057cf5a2f71d603e2fc3a3",
"sha256:9a6c7a3c87d595608a39980ebaa04d5a37f94024c9f24eb7d10262b92f739ddb",
"sha256:b46e37db3cc267b4dea1f56da7346c9727e1209aa98487179ee8ebed09d21e43",
"sha256:b4ceb5324b998ce2003bc17d519080b4ec8d5b7b70794cbd2836101406a9be31",
"sha256:cb33ccf15e89f7ed89b235cff9d49e2e62c6c981a6061c9c8bb47ed7951190bc",
"sha256:d198820aba55660b4d74f7b5fd1f17db3aa5eb3e6893b0a41b75e84e4f9e0e4b",
"sha256:d34579085401d3f49762d2f7d6634d6b6c2ae1242202e860f4d26b046e3a1006",
"sha256:eb8163f5e549a22888c18b0d53d6bb62a20510060a22fd5a995ec8a05268df8a",
"sha256:f73bff05db2a3e5974a6fd248af2566134d8981fd7ab012e5dd4ddb1d9a70699"
],
"index": "pypi",
"version": "==38.0.4"
"version": "==41.0.1"
},
"dataclasses-json": {
"hashes": [

View File

@@ -197,7 +197,8 @@ Outputs:
* **Title**: Post title
* **Text**: Post text
* **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media)
* **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive
@@ -228,7 +229,7 @@ Use `python -m src.auto_archiver --config secrets/orchestration.yaml` to run fro
#### Docker development
Working with Docker locally:
* `docker build . -t auto-archiver` to build a local image
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml`
* `docker run --rm -v $PWD/secrets:/app/secrets auto-archiver --config secrets/orchestration.yaml`
* to use a local archive, also mount a volume for it by adding `-v $PWD/local_archive:/app/local_archive`

View File

@@ -18,7 +18,7 @@ steps:
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_enricher
# - pdq_hash_enricher
# - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
formatter: html_formatter # defaults to mute_formatter
storages:
- local_storage

View File

@@ -48,7 +48,7 @@ class TelegramArchiver(Archiver):
video = s.find("video")
if video is None:
logger.warning("could not find video")
image_tags = s.find_all(class_="js-message_photo")
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
image_urls = []
for im in image_tags:

View File

@@ -6,6 +6,7 @@ from slugify import slugify
from . import Archiver
from ..core import Metadata, Media
from ..utils.misc import remove_get_parameters
class TwitterArchiver(Archiver):
@@ -90,20 +91,36 @@ class TwitterArchiver(Archiver):
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
"""
CURRENTLY STOPPED WORKING
Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
"""
return False
# https://stackoverflow.com/a/71867055/6196010
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Origin": "https://platform.twitter.com",
"Connection": "keep-alive",
"Referer": "https://platform.twitter.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"TE": "trailers"
}
logger.debug(f"Trying twitter hack for {url=}")
result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}"
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url)
if r.status_code != 200: return False
tweet = r.json()
urls = []
for p in tweet["photos"]:
for p in tweet.get("photos", []):
urls.append(p["url"])
# 1 tweet has 1 video max
@@ -113,14 +130,18 @@ class TwitterArchiver(Archiver):
logger.debug(f"Twitter hack got {urls=}")
for u in urls:
media = Media()
for i, u in enumerate(urls):
media = Media(filename="")
media.set("src", u)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item)
ext = ""
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-hack")
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle

View File

@@ -67,7 +67,7 @@ class GsheetsDb(Database):
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
batch_if_valid('hash', media.get("hash", "not-calculated"))
if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present
pdq_hashes = []

View File

@@ -2,6 +2,7 @@
import os, json, requests
from datetime import datetime
from loguru import logger
from urllib.parse import urlparse, urlunparse
def mkdir_if_not_exists(folder):
@@ -20,6 +21,13 @@ def expand_url(url):
logger.error(f'Failed to expand url {url}')
return url
def remove_get_parameters(url: str, remove_fragment: bool = False) -> str:
    """Return *url* with its query string removed.

    Example: ``http://example.com/file.mp4?t=1`` -> ``http://example.com/file.mp4``.
    Useful so that extension-based lookups (e.g. ``mimetypes``) see a clean path.

    Args:
        url: The URL to clean.
        remove_fragment: When true, also drop the ``#fragment`` part. Defaults
            to False, which keeps the fragment untouched.

    Returns:
        The URL rebuilt from its parsed components, without the query
        (and without the fragment when ``remove_fragment`` is set).
    """
    parsed_url = urlparse(url)
    stripped = parsed_url._replace(query='')
    if remove_fragment:
        # A trailing "#..." hides the file extension just like "?..." does.
        stripped = stripped._replace(fragment='')
    return urlunparse(stripped)
def getattr_or(o: object, prop: str, default=None):
try:

View File

@@ -3,7 +3,7 @@ _MAJOR = "0"
_MINOR = "5"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "24"
_PATCH = "28"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""