mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
56526a9ac7 | ||
|
|
3a22cc28c0 | ||
|
|
dbb3dfa04f | ||
|
|
01bdb35f5d | ||
|
|
43cbc6ac56 | ||
|
|
9c7cab1ae2 | ||
|
|
a9a0bae083 | ||
|
|
97d133ce79 | ||
|
|
432ee3dcfd | ||
|
|
794b4f6052 | ||
|
|
965d7d41dd | ||
|
|
e73faa70cc | ||
|
|
80beab9f23 | ||
|
|
200cea4e12 | ||
|
|
1256fde159 | ||
|
|
65e222e177 | ||
|
|
f2eb9ef784 | ||
|
|
2081c16555 | ||
|
|
d3efd7121c | ||
|
|
9d3cd5774b |
@@ -21,7 +21,7 @@ This allows you to run the auto-archiver without the `poetry run` prefix.
|
|||||||
### Optional Development Packages
|
### Optional Development Packages
|
||||||
|
|
||||||
Install development packages (used for unit tests etc.) using:
|
Install development packages (used for unit tests etc.) using:
|
||||||
`poetry install -with dev`
|
`poetry install --with dev`
|
||||||
|
|
||||||
|
|
||||||
```{toctree}
|
```{toctree}
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ Note not all warnings can be fixed automatically.
|
|||||||
|
|
||||||
Most fixes are safe, but some non-standard practices such as dynamic loading are not picked up by linters. Ensure you check any modifications by this before committing them.
|
Most fixes are safe, but some non-standard practices such as dynamic loading are not picked up by linters. Ensure you check any modifications by this before committing them.
|
||||||
```shell
|
```shell
|
||||||
make ruff-fix
|
make ruff-clean
|
||||||
```
|
```
|
||||||
|
|
||||||
**Changing Configurations ⚙️**
|
**Changing Configurations ⚙️**
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
|
|
||||||
## Running Tests
|
## Running Tests
|
||||||
|
|
||||||
1. Make sure you've installed the dev dependencies with `pytest install --with dev`
|
1. Make sure you've installed the dev dependencies with `poetry install --with dev`
|
||||||
2. Tests can be run as follows:
|
2. Tests can be run as follows:
|
||||||
```{code} bash
|
```{code} bash
|
||||||
#### Command prefix of 'poetry run' removed here for simplicity
|
#### Command prefix of 'poetry run' removed here for simplicity
|
||||||
@@ -26,7 +26,7 @@ pytest -ra -v tests/test_file.py
|
|||||||
pytest -ra -v tests/test_file.py::test_function_name
|
pytest -ra -v tests/test_file.py::test_function_name
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Some tests require environment variables to be set. You can use the example `.env.test.example` file as a template. Copy it to `.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
|
3. Some tests require environment variables to be set. You can use the example `tests/.env.test.example` file as a template. Copy it to `tests/.env.test` and fill in the required values. This file will be loaded automatically by `pytest`.
|
||||||
```{code} bash
|
```{code} bash
|
||||||
cp .env.test.example .env.test
|
cp tests/.env.test.example tests/.env.test
|
||||||
```
|
```
|
||||||
2117
poetry.lock
generated
2117
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "auto-archiver"
|
name = "auto-archiver"
|
||||||
version = "1.1.2"
|
version = "1.1.6"
|
||||||
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
|
||||||
|
|
||||||
requires-python = ">=3.10,<3.13"
|
requires-python = ">=3.10,<3.13"
|
||||||
@@ -50,8 +50,8 @@ dependencies = [
|
|||||||
"retrying (>=0.0.0)",
|
"retrying (>=0.0.0)",
|
||||||
"rich-argparse (>=1.6.0,<2.0.0)",
|
"rich-argparse (>=1.6.0,<2.0.0)",
|
||||||
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
"ruamel-yaml (>=0.18.10,<0.19.0)",
|
||||||
"rfc3161-client (==1.0.3)",
|
"rfc3161-client (>=1.0.5)",
|
||||||
"cryptography (>44.0.1,<45.0.0)",
|
"cryptography (>=46.0.3)",
|
||||||
"opentimestamps (>=0.4.5,<0.5.0)",
|
"opentimestamps (>=0.4.5,<0.5.0)",
|
||||||
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
"bgutil-ytdlp-pot-provider (>=1.0.0)",
|
||||||
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
"yt-dlp[curl-cffi,default] (>=2025.5.22,<2026.0.0)",
|
||||||
|
|||||||
@@ -97,7 +97,10 @@ class AntibotExtractorEnricher(Extractor, Enricher):
|
|||||||
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
sb.uc_gui_click_rc() # NB: using handle instead of click breaks some sites like reddit, for now we separate here but can have dropins deciding this in the future
|
||||||
|
|
||||||
dropin = self._get_suitable_dropin(url, sb)
|
dropin = self._get_suitable_dropin(url, sb)
|
||||||
dropin.open_page(url)
|
if not dropin.open_page(url):
|
||||||
|
# TODO: could we detect deleted videos?
|
||||||
|
logger.warning("Failed to open drop-in page")
|
||||||
|
return False
|
||||||
|
|
||||||
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
if self.detect_auth_wall and (dropin.hit_auth_wall() and self._hit_auth_wall(sb)):
|
||||||
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
logger.warning("Skipping since auth wall or CAPTCHA was detected")
|
||||||
|
|||||||
@@ -1,17 +1,20 @@
|
|||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from typing import Mapping
|
from typing import Mapping
|
||||||
|
|
||||||
|
from auto_archiver.utils.custom_logger import logger
|
||||||
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
from auto_archiver.modules.antibot_extractor_enricher.dropin import Dropin
|
||||||
|
|
||||||
|
|
||||||
class TikTokDropin(Dropin):
|
class TikTokDropin(Dropin):
|
||||||
"""
|
"""
|
||||||
A class to handle TikTok drop-in functionality for the antibot extractor enricher module.
|
A class to handle TikTok drop-in functionality for the antibot extractor enricher module.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def documentation() -> Mapping[str, str]:
|
def documentation() -> Mapping[str, str]:
|
||||||
return {
|
return {
|
||||||
"name": "TikTok Dropin",
|
"name": "TikTok Dropin",
|
||||||
"description": "Handles TikTok posts and works without authentication.",
|
"description": "Handles TikTok posts and works without authentication.\nNOTE: This dropin is highly susceptible to TikTok's bot detection mechanisms and may not work reliably if you reuse the same IP. The GenericExtractor is recommended for TikTok posts, as it handles video/image download more reliable. In the future we plan to implement better anti captcha measures for this dropin.",
|
||||||
"site": "tiktok.com",
|
"site": "tiktok.com",
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -33,6 +36,9 @@ class TikTokDropin(Dropin):
|
|||||||
# TODO: implement login logic
|
# TODO: implement login logic
|
||||||
if url != self.sb.get_current_url():
|
if url != self.sb.get_current_url():
|
||||||
return False
|
return False
|
||||||
|
if self.sb.is_text_visible("Video currently unavailable"):
|
||||||
|
logger.debug("Video may have been removed or is private.")
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def hit_auth_wall(self) -> bool:
|
def hit_auth_wall(self) -> bool:
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import datetime
|
|||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from typing import Generator, Type
|
from typing import Generator, Type
|
||||||
@@ -305,9 +306,9 @@ class GenericExtractor(Extractor):
|
|||||||
result.set_url(url)
|
result.set_url(url)
|
||||||
|
|
||||||
if "description" in video_data and not result.get("content"):
|
if "description" in video_data and not result.get("content"):
|
||||||
result.set_content(video_data.get("description"))
|
result.set_content(video_data.pop("description"))
|
||||||
# extract comments if enabled
|
# extract comments if enabled
|
||||||
if self.comments and video_data.get("comments", []) is not None:
|
if self.comments and video_data.get("comments", None) is not None:
|
||||||
result.set(
|
result.set(
|
||||||
"comments",
|
"comments",
|
||||||
[
|
[
|
||||||
@@ -406,9 +407,9 @@ class GenericExtractor(Extractor):
|
|||||||
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
logger.error(f"Error loading subtitle file {val.get('filepath')}: {e}")
|
||||||
result.add_media(new_media)
|
result.add_media(new_media)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing entry {entry}: {e}")
|
logger.error(f"Error processing entry {str(entry)[:256]}: {e} {traceback.format_exc()}")
|
||||||
if not len(result.media):
|
if not len(result.media):
|
||||||
logger.info(f"No media found for entry {entry}, skipping.")
|
logger.info(f"No media found for entry {str(entry)[:256]}, skipping.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return self.add_metadata(data, info_extractor, url, result)
|
return self.add_metadata(data, info_extractor, url, result)
|
||||||
@@ -516,7 +517,7 @@ class GenericExtractor(Extractor):
|
|||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if result:
|
if result and not result.is_success():
|
||||||
extractor_name = "yt-dlp"
|
extractor_name = "yt-dlp"
|
||||||
if info_extractor:
|
if info_extractor:
|
||||||
extractor_name += f"_{info_extractor.ie_key()}"
|
extractor_name += f"_{info_extractor.ie_key()}"
|
||||||
@@ -535,7 +536,6 @@ class GenericExtractor(Extractor):
|
|||||||
if url.startswith("https://ya.ru"):
|
if url.startswith("https://ya.ru"):
|
||||||
url = url.replace("https://ya.ru", "https://yandex.ru")
|
url = url.replace("https://ya.ru", "https://yandex.ru")
|
||||||
item.set("replaced_url", url)
|
item.set("replaced_url", url)
|
||||||
logger.debug(f"{skip_proxy=}, {self.proxy_on_failure_only=}, {self.proxy=}")
|
|
||||||
|
|
||||||
# proxy_on_failure_only logic
|
# proxy_on_failure_only logic
|
||||||
if self.proxy and self.proxy_on_failure_only and not skip_proxy:
|
if self.proxy and self.proxy_on_failure_only and not skip_proxy:
|
||||||
@@ -605,9 +605,9 @@ class GenericExtractor(Extractor):
|
|||||||
validated_options
|
validated_options
|
||||||
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||||
|
|
||||||
|
result: Metadata = None
|
||||||
for info_extractor in self.suitable_extractors(url):
|
for info_extractor in self.suitable_extractors(url):
|
||||||
result = self.download_for_extractor(info_extractor, url, ydl)
|
local_result: Metadata = self.download_for_extractor(info_extractor, url, ydl)
|
||||||
if result:
|
if local_result:
|
||||||
return result
|
result = result.merge(local_result) if result else local_result
|
||||||
|
return result if result else False
|
||||||
return False
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
import requests
|
import requests
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
@@ -14,12 +15,16 @@ class Tiktok(GenericDropin):
|
|||||||
It's useful for capturing content that requires a login, like sensitive content.
|
It's useful for capturing content that requires a login, like sensitive content.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Regex pattern to match TikTok photo post URLs
|
||||||
|
PHOTO_URL_REGEX = r"https?://(?:www\.)?tiktok\.com/@[\w\.-]+/photo/\d+"
|
||||||
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
|
||||||
|
|
||||||
def suitable(self, url, info_extractor) -> bool:
|
def suitable(self, url, info_extractor) -> bool:
|
||||||
"""This dropin (which uses Tikvm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users.
|
"""This dropin (which uses Tikvm) is suitable for *all* Tiktok type URLs - videos, lives, VMs, and users.
|
||||||
Return the 'suitable' method from the TikTokIE class."""
|
Return the 'suitable' method from the TikTokIE class."""
|
||||||
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE))
|
return any(extractor().suitable(url) for extractor in (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)) or (
|
||||||
|
re.match(self.PHOTO_URL_REGEX, url) is not None
|
||||||
|
)
|
||||||
|
|
||||||
def extract_post(self, url: str, ie_instance):
|
def extract_post(self, url: str, ie_instance):
|
||||||
logger.debug("Using Tikwm API to attempt to download tiktok video")
|
logger.debug("Using Tikwm API to attempt to download tiktok video")
|
||||||
@@ -28,56 +33,91 @@ class Tiktok(GenericDropin):
|
|||||||
|
|
||||||
r = requests.get(endpoint)
|
r = requests.get(endpoint)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise ValueError(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
|
raise ValueError(f"Unexpected status code '{r.status_code}' from tikwm.com")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
json_response = r.json()
|
json_response = r.json()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError(f"failed to parse JSON response from tikwm.com for {url=}")
|
raise ValueError("Failed to parse JSON response from tikwm.com")
|
||||||
|
|
||||||
if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
|
if not json_response.get("msg") == "success" or not (api_data := json_response.get("data", {})):
|
||||||
raise ValueError(f"failed to get a valid response from tikwm.com for {url=}: {repr(json_response)}")
|
raise ValueError(f"Unable to download with tikwm.com: {repr(json_response)}")
|
||||||
|
|
||||||
# tries to get the non-watermarked version first
|
# tries to get the non-watermarked version first
|
||||||
video_url = api_data.pop("play", api_data.pop("wmplay", None))
|
play_url = api_data.pop("play", api_data.pop("wmplay", None))
|
||||||
if not video_url:
|
if play_url and "mime_type=audio" in play_url:
|
||||||
raise ValueError(f"no valid video URL found in response from tikwm.com for {url=}")
|
play_url = None
|
||||||
|
if play_url:
|
||||||
api_data["video_url"] = video_url
|
api_data["video_url"] = play_url
|
||||||
return api_data
|
return api_data
|
||||||
|
|
||||||
def keys_to_clean(self, video_data: dict, info_extractor):
|
def keys_to_clean(self, video_data: dict, info_extractor):
|
||||||
return ["video_url", "title", "create_time", "author", "cover", "origin_cover", "ai_dynamic_cover", "duration"]
|
return [
|
||||||
|
"video_url",
|
||||||
|
"title",
|
||||||
|
"create_time",
|
||||||
|
"author",
|
||||||
|
"cover",
|
||||||
|
"origin_cover",
|
||||||
|
"ai_dynamic_cover",
|
||||||
|
"duration",
|
||||||
|
"size",
|
||||||
|
"wm_size",
|
||||||
|
"music",
|
||||||
|
"music_info",
|
||||||
|
"play_count",
|
||||||
|
"digg_count",
|
||||||
|
"comment_count",
|
||||||
|
"share_count",
|
||||||
|
"download_count",
|
||||||
|
"collect_count",
|
||||||
|
"anchors",
|
||||||
|
"anchors_extras",
|
||||||
|
"is_ad",
|
||||||
|
"commerce_info",
|
||||||
|
"commercial_video_info",
|
||||||
|
"item_comment_settings",
|
||||||
|
"mentioned_users",
|
||||||
|
] # all of these will be added via api_data in a single metadata field vs individual ones in the generic extractor
|
||||||
|
|
||||||
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
def create_metadata(self, post: dict, ie_instance, archiver, url):
|
||||||
# prepare result, start by downloading video
|
# prepare result, start by downloading video
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
video_url = post.pop("video_url")
|
is_success = False
|
||||||
|
|
||||||
# get the cover if possible
|
# get the cover if possible
|
||||||
cover_url = post.pop("origin_cover", post.pop("cover", post.pop("ai_dynamic_cover", None)))
|
cover_url = post.pop("origin_cover", post.pop("cover", post.pop("ai_dynamic_cover", None)))
|
||||||
if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
|
if cover_url and (cover_downloaded := archiver.download_from_url(cover_url)):
|
||||||
result.add_media(Media(cover_downloaded))
|
result.add_media(Media(cover_downloaded))
|
||||||
|
|
||||||
# get the video or fail
|
for image_url in post.pop("images", []):
|
||||||
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
if image_downloaded := archiver.download_from_url(image_url):
|
||||||
if not video_downloaded:
|
result.add_media(Media(image_downloaded))
|
||||||
logger.error("Failed to download video")
|
is_success = True # this is an images post and we got it/them
|
||||||
return False
|
|
||||||
video_media = Media(video_downloaded)
|
# get the video if present, could be an image post
|
||||||
if duration := post.get("duration", None):
|
if video_url := post.pop("video_url", None):
|
||||||
video_media.set("duration", duration)
|
video_downloaded = archiver.download_from_url(video_url, f"vid_{post.get('id', '')}")
|
||||||
result.add_media(video_media)
|
if not video_downloaded:
|
||||||
|
logger.error("Failed to download video")
|
||||||
|
return False
|
||||||
|
video_media = Media(video_downloaded)
|
||||||
|
if duration := post.pop("duration", None):
|
||||||
|
video_media.set("duration", duration)
|
||||||
|
result.add_media(video_media)
|
||||||
|
is_success = True # this is a video post and we got it
|
||||||
|
|
||||||
# add remaining metadata
|
# add remaining metadata
|
||||||
result.set_title(post.get("title", ""))
|
result.set_title(post.pop("title", ""))
|
||||||
|
|
||||||
if created_at := post.get("create_time", None):
|
if created_at := post.pop("create_time", None):
|
||||||
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
|
||||||
|
|
||||||
if author := post.get("author", None):
|
if author := post.pop("author", None):
|
||||||
result.set("author", author)
|
result.set("author", author)
|
||||||
|
|
||||||
result.set("api_data", post)
|
result.set("api_data", {k: v for k, v in post.items() if v})
|
||||||
|
if is_success:
|
||||||
|
result.success("yt-dlp_TikTok")
|
||||||
|
else:
|
||||||
|
raise ValueError("Unable to download any media from TikTok post, possibly deleted or private.")
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -4,12 +4,12 @@ from importlib.metadata import version
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
from retrying import retry
|
||||||
import requests
|
import requests
|
||||||
from auto_archiver.utils.custom_logger import logger
|
from auto_archiver.utils.custom_logger import logger
|
||||||
|
|
||||||
from rfc3161_client import (decode_timestamp_response,TimestampRequestBuilder,TimeStampResponse, VerifierBuilder)
|
from rfc3161_client import (decode_timestamp_response, TimestampRequestBuilder, TimeStampResponse, VerifierBuilder)
|
||||||
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
from rfc3161_client import VerificationError as Rfc3161VerificationError
|
||||||
from rfc3161_client.base import HashAlgorithm
|
|
||||||
from rfc3161_client.tsp import SignedData
|
from rfc3161_client.tsp import SignedData
|
||||||
from cryptography import x509
|
from cryptography import x509
|
||||||
from cryptography.hazmat.primitives import serialization
|
from cryptography.hazmat.primitives import serialization
|
||||||
@@ -60,7 +60,6 @@ class TimestampingEnricher(Enricher):
|
|||||||
logger.debug(f"No hashes found")
|
logger.debug(f"No hashes found")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
hashes_fn = os.path.join(self.tmp_dir, "hashes.txt")
|
hashes_fn = os.path.join(self.tmp_dir, "hashes.txt")
|
||||||
|
|
||||||
data_to_sign = "\n".join(hashes)
|
data_to_sign = "\n".join(hashes)
|
||||||
@@ -113,7 +112,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
f.write(timestamp_token)
|
f.write(timestamp_token)
|
||||||
return tst_path
|
return tst_path
|
||||||
|
|
||||||
def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) -> x509.Certificate:
|
def verify_signed(self, timestamp_response: TimeStampResponse, message: bytes) -> x509.Certificate:
|
||||||
"""
|
"""
|
||||||
Verify a Signed Timestamp Response is trusted by a known Certificate Authority.
|
Verify a Signed Timestamp Response is trusted by a known Certificate Authority.
|
||||||
|
|
||||||
@@ -158,7 +157,6 @@ class TimestampingEnricher(Enricher):
|
|||||||
|
|
||||||
verifier = builder.build()
|
verifier = builder.build()
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
verifier.verify(timestamp_response, message_hash)
|
verifier.verify(timestamp_response, message_hash)
|
||||||
return certificate
|
return certificate
|
||||||
@@ -171,18 +169,33 @@ class TimestampingEnricher(Enricher):
|
|||||||
# see https://github.com/sigstore/sigstore-python/blob/99948d5b80525a5a104e904ffea58169dc6e0629/sigstore/_internal/timestamp.py#L84-L121
|
# see https://github.com/sigstore/sigstore-python/blob/99948d5b80525a5a104e904ffea58169dc6e0629/sigstore/_internal/timestamp.py#L84-L121
|
||||||
|
|
||||||
timestamp_request = (
|
timestamp_request = (
|
||||||
TimestampRequestBuilder().data(bytes_data).nonce(nonce=True).build()
|
TimestampRequestBuilder().data(bytes_data).nonce(nonce=True).build()
|
||||||
)
|
)
|
||||||
try:
|
|
||||||
|
@retry(
|
||||||
|
wait_exponential_multiplier=1,
|
||||||
|
stop_max_attempt_number=2,
|
||||||
|
)
|
||||||
|
def sign_with_retry():
|
||||||
response = self.session.post(tsa_url, data=timestamp_request.as_bytes(), timeout=10)
|
response = self.session.post(tsa_url, data=timestamp_request.as_bytes(), timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
return response
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = sign_with_retry()
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logger.error(f"Error while sending request to {tsa_url=}: {e}")
|
logger.error(f"Error while sending request to {tsa_url=}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
@retry(
|
||||||
|
wait_exponential_multiplier=1,
|
||||||
|
stop_max_attempt_number=2,
|
||||||
|
)
|
||||||
|
def decode_with_retry(response):
|
||||||
|
return decode_timestamp_response(response.content)
|
||||||
# Check that we can parse the response but do not *verify* it
|
# Check that we can parse the response but do not *verify* it
|
||||||
try:
|
try:
|
||||||
timestamp_response = decode_timestamp_response(response.content)
|
timestamp_response = decode_with_retry(response)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logger.error(f"Invalid timestamp response from server {tsa_url}: {e}")
|
logger.error(f"Invalid timestamp response from server {tsa_url}: {e}")
|
||||||
raise
|
raise
|
||||||
@@ -196,7 +209,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
if len(certs) == 1:
|
if len(certs) == 1:
|
||||||
return certs
|
return certs
|
||||||
|
|
||||||
while(len(ordered_certs) < len(certs)):
|
while (len(ordered_certs) < len(certs)):
|
||||||
if len(ordered_certs) == 0:
|
if len(ordered_certs) == 0:
|
||||||
for cert in certs:
|
for cert in certs:
|
||||||
if not [c for c in certs if cert.subject == c.issuer]:
|
if not [c for c in certs if cert.subject == c.issuer]:
|
||||||
@@ -220,7 +233,7 @@ class TimestampingEnricher(Enricher):
|
|||||||
|
|
||||||
cert_chain = []
|
cert_chain = []
|
||||||
for i, cert in enumerate(certificates):
|
for i, cert in enumerate(certificates):
|
||||||
cert_fn = os.path.join(self.tmp_dir, f"{i+1} – {str(cert.serial_number)[:20]}.crt")
|
cert_fn = os.path.join(self.tmp_dir, f"{i + 1} – {str(cert.serial_number)[:20]}.crt")
|
||||||
with open(cert_fn, "wb") as f:
|
with open(cert_fn, "wb") as f:
|
||||||
f.write(cert.public_bytes(encoding=serialization.Encoding.PEM))
|
f.write(cert.public_bytes(encoding=serialization.Encoding.PEM))
|
||||||
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.get_attributes_for_oid(x509.NameOID.COMMON_NAME)[0].value))
|
cert_chain.append(Media(filename=cert_fn).set("subject", cert.subject.get_attributes_for_oid(x509.NameOID.COMMON_NAME)[0].value))
|
||||||
|
|||||||
@@ -5,6 +5,9 @@ from auto_archiver.modules.antibot_extractor_enricher.antibot_extractor_enricher
|
|||||||
from .test_extractor_base import TestExtractorBase
|
from .test_extractor_base import TestExtractorBase
|
||||||
|
|
||||||
|
|
||||||
|
CI = os.getenv("GITHUB_ACTIONS", "") == "true"
|
||||||
|
|
||||||
|
|
||||||
class DummySB:
|
class DummySB:
|
||||||
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
def __init__(self, url="", title="", visible_texts=None, visible_elements=None):
|
||||||
self._url = url
|
self._url = url
|
||||||
@@ -51,14 +54,15 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
|
|
||||||
@pytest.mark.download
|
@pytest.mark.download
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"url,in_title,in_text,image_count,video_count",
|
"url,in_title,in_text,image_count,video_count,skip_ci",
|
||||||
[
|
[
|
||||||
(
|
(
|
||||||
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
"https://en.wikipedia.org/wiki/Western_barn_owl",
|
||||||
"western barn owl",
|
"western barn owl",
|
||||||
"Tyto alba",
|
"Tyto alba",
|
||||||
5,
|
4,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
"https://www.bellingcat.com/news/2025/04/29/open-sources-show-myanmar-junta-airstrike-damages-despite-post-earthquake-ceasefire/",
|
||||||
@@ -66,6 +70,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Bellingcat has geolocated",
|
"Bellingcat has geolocated",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
"https://www.bellingcat.com/news/2025/03/27/gaza-israel-palestine-shot-killed-injured-destroyed-dangerous-drone-journalists-in-gaza/",
|
||||||
@@ -73,6 +78,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"continued the work of Gazan journalists",
|
"continued the work of Gazan journalists",
|
||||||
5,
|
5,
|
||||||
1,
|
1,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.bellingcat.com/about/general-information",
|
"https://www.bellingcat.com/about/general-information",
|
||||||
@@ -80,6 +86,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Stichting Bellingcat",
|
"Stichting Bellingcat",
|
||||||
0, # SVGs are ignored
|
0, # SVGs are ignored
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
"https://vk.com/wikipedia?from=search&w=wall-36156673_20451",
|
||||||
@@ -87,6 +94,7 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"16 сентября 1985 года лейблом EMI Records.",
|
"16 сентября 1985 года лейблом EMI Records.",
|
||||||
5,
|
5,
|
||||||
0,
|
0,
|
||||||
|
False,
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
"https://www.tiktok.com/@tracy_2424/photo/7418200173953830162",
|
||||||
@@ -94,13 +102,19 @@ class TestAntibotExtractorEnricher(TestExtractorBase):
|
|||||||
"Dito ko lang",
|
"Dito ko lang",
|
||||||
1,
|
1,
|
||||||
0,
|
0,
|
||||||
|
True,
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_download_pages_with_media(self, setup_module, make_item, url, in_title, in_text, image_count, video_count):
|
def test_download_pages_with_media(
|
||||||
|
self, setup_module, make_item, url, in_title, in_text, image_count, video_count, skip_ci
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Test downloading pages with media.
|
Test downloading pages with media.
|
||||||
"""
|
"""
|
||||||
|
if CI and skip_ci:
|
||||||
|
pytest.skip("Skipping test in CI environment")
|
||||||
|
|
||||||
self.extractor = setup_module(
|
self.extractor = setup_module(
|
||||||
self.extractor_module,
|
self.extractor_module,
|
||||||
self.config
|
self.config
|
||||||
|
|||||||
@@ -48,8 +48,6 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
|
||||||
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
|
||||||
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
|
||||||
("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
|
|
||||||
("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_suitable_extractors(self, url, suitable_extractors):
|
def test_suitable_extractors(self, url, suitable_extractors):
|
||||||
@@ -148,6 +146,7 @@ class TestGenericExtractor(TestExtractorBase):
|
|||||||
def test_bluesky_download_video(self, make_item):
|
def test_bluesky_download_video(self, make_item):
|
||||||
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
item = make_item("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")
|
||||||
result = self.extractor.download(item)
|
result = self.extractor.download(item)
|
||||||
|
assert result.get_url() == "https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i"
|
||||||
assert result is not False
|
assert result is not False
|
||||||
|
|
||||||
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
@pytest.mark.skipif(not TEST_TRUTH_SOCIAL, reason="Truth social download tests disabled in environment variables.")
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
|||||||
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
|
||||||
("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
|
("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
|
||||||
("https://vt.tiktok.com/ZSMTJeqRP/", True),
|
("https://vt.tiktok.com/ZSMTJeqRP/", True),
|
||||||
|
("https://tiktok.com/@user/photo/123?lang=en", True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
|
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
|
||||||
@@ -68,10 +69,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
|||||||
mock_get.assert_called_once()
|
mock_get.assert_called_once()
|
||||||
mock_get.return_value.json.assert_called_once()
|
mock_get.return_value.json.assert_called_once()
|
||||||
# first message is just the 'Skipping using ytdlp to download files for TikTok' message
|
# first message is just the 'Skipping using ytdlp to download files for TikTok' message
|
||||||
assert (
|
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
|
||||||
in caplog.text
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_get.return_value.json.side_effect = Exception
|
mock_get.return_value.json.side_effect = Exception
|
||||||
with caplog.at_level("ERROR"):
|
with caplog.at_level("ERROR"):
|
||||||
@@ -79,10 +77,7 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
|||||||
mock_get.assert_called()
|
mock_get.assert_called()
|
||||||
assert mock_get.call_count == 2
|
assert mock_get.call_count == 2
|
||||||
assert mock_get.return_value.json.call_count == 2
|
assert mock_get.return_value.json.call_count == 2
|
||||||
assert (
|
assert "Failed to parse JSON response from tikwm.com" in caplog.text
|
||||||
"failed to parse JSON response from tikwm.com for url='https://www.tiktok.com/@example/video/1234'"
|
|
||||||
in caplog.text
|
|
||||||
)
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"response",
|
"response",
|
||||||
@@ -98,27 +93,30 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
|||||||
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
|
||||||
mock_get.assert_called_once()
|
mock_get.assert_called_once()
|
||||||
mock_get.return_value.json.assert_called_once()
|
mock_get.return_value.json.assert_called_once()
|
||||||
assert "failed to get a valid response from tikwm.com" in caplog.text
|
assert "Unable to download with tikwm.com: " in caplog.text
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"response,has_vid",
|
"response,is_success",
|
||||||
[
|
[
|
||||||
({"data": {"id": 123}}, False),
|
({"data": {"id": 123, "images": []}}, False),
|
||||||
({"data": {"wmplay": "url"}}, True),
|
({"data": {"wmplay": "url", "images": ["img1.jpg"]}}, True),
|
||||||
({"data": {"play": "url"}}, True),
|
({"data": {"play": "url", "images": ["img1.jpg"]}}, True),
|
||||||
|
({"data": {"images": ["img1.jpg"]}}, True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_correct_extraction(self, mock_get, make_item, response, has_vid, mocker):
|
def test_correct_extraction(self, mock_get, make_item, response, is_success, mocker):
|
||||||
|
data = {k: v for k, v in response.get("data", {}).items()}
|
||||||
mock_get.return_value.status_code = 200
|
mock_get.return_value.status_code = 200
|
||||||
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
mock_get.return_value.json.return_value = {"msg": "success", **response}
|
||||||
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
|
||||||
if not has_vid:
|
total_media = len(data.get("images", [])) + (1 if data.get("wmplay", data.get("play")) else 0)
|
||||||
assert result is False
|
if is_success:
|
||||||
else:
|
|
||||||
assert result.is_success()
|
assert result.is_success()
|
||||||
assert len(result.media) == 1
|
assert len(result.media) == total_media
|
||||||
|
else:
|
||||||
|
assert result is False
|
||||||
mock_get.assert_called()
|
mock_get.assert_called()
|
||||||
assert mock_get.call_count == 1 + int(has_vid)
|
assert mock_get.call_count == 1 + total_media
|
||||||
mock_get.return_value.json.assert_called_once()
|
mock_get.return_value.json.assert_called_once()
|
||||||
|
|
||||||
def test_correct_data_extracted(self, mock_get, make_item):
|
def test_correct_data_extracted(self, mock_get, make_item):
|
||||||
@@ -142,7 +140,9 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
|
|||||||
assert len(result.media) == 2
|
assert len(result.media) == 2
|
||||||
assert result.get_title() == "Title"
|
assert result.get_title() == "Title"
|
||||||
assert result.get("author") == "Author"
|
assert result.get("author") == "Author"
|
||||||
assert result.get("api_data") == {"other": "data", "id": 123}
|
assert result.get("other") == "data"
|
||||||
|
assert result.get("comments") is None
|
||||||
|
assert result.get("api_data") == {"id": 123, "other": "data"}
|
||||||
assert result.media[1].get("duration") == 60
|
assert result.media[1].get("duration") == 60
|
||||||
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user