Compare commits

...

11 Commits

Author SHA1 Message Date
msramalho
1e66a2c905 Bump version to v0.6.0 for release 2023-07-27 15:42:29 +01:00
msramalho
e8f44b652e minor improvements 2023-07-27 15:42:23 +01:00
msramalho
dd034da844 feat: WACZ enricher can now be probed for media, and used as an archiver OR enricher 2023-07-27 15:42:10 +01:00
msramalho
65e3c99483 Bump version to v0.5.28 for release 2023-07-26 16:13:14 +01:00
msramalho
888ad8f004 fix: twitter hack videos extension detection 2023-07-26 16:12:56 +01:00
msramalho
086a9e6c84 fix: remove unnecessary log 2023-07-11 12:17:15 +01:00
msramalho
4d80ee6f02 Bump version to v0.5.27 for release 2023-07-11 12:16:06 +01:00
msramalho
92569ae6be fix: telegram archiver was outdated for images 2023-07-11 12:15:56 +01:00
msramalho
abaf86c776 Bump version to v0.5.26 for release 2023-07-02 18:42:59 +02:00
msramalho
8005a1955a fixes #82 twitter api walls 2023-07-02 18:42:43 +02:00
msramalho
b7889a182d readme update 2023-06-26 18:18:46 +01:00
15 changed files with 699 additions and 598 deletions

View File

@@ -30,12 +30,13 @@ tqdm = "*"
jinja2 = "*" jinja2 = "*"
cryptography = "*" cryptography = "*"
dataclasses-json = "*" dataclasses-json = "*"
yt-dlp = ">=2023.2.17" yt-dlp = "*"
vk-url-scraper = "*" vk-url-scraper = "*"
uwsgi = "*" uwsgi = "*"
requests = {extras = ["socks"], version = "*"} requests = {extras = ["socks"], version = "*"}
# wacz = "==0.4.8" # wacz = "==0.4.8"
numpy = "*" numpy = "*"
warcio = "*"
[requires] [requires]
python_version = "3.10" python_version = "3.10"

1094
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -197,7 +197,8 @@ Outputs:
* **Title**: Post title * **Title**: Post title
* **Text**: Post text * **Text**: Post text
* **Screenshot**: Link to screenshot of post * **Screenshot**: Link to screenshot of post
* **Hash**: Hash of archived HTML file (which contains hashes of post media) * **Hash**: Hash of archived HTML file (which contains hashes of post media) - for checksums/verification
* **Perceptual Hash**: Perceptual hashes of found images - these can be used for de-duplication of content
* **WACZ**: Link to a WACZ web archive of post * **WACZ**: Link to a WACZ web archive of post
* **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive * **ReplayWebpage**: Link to a ReplayWebpage viewer of the WACZ archive

View File

@@ -12,13 +12,14 @@ steps:
# - tiktok_archiver # - tiktok_archiver
- youtubedl_archiver - youtubedl_archiver
# - wayback_archiver_enricher # - wayback_archiver_enricher
# - wacz_archiver_enricher
enrichers: enrichers:
- hash_enricher - hash_enricher
# - screenshot_enricher # - screenshot_enricher
# - thumbnail_enricher # - thumbnail_enricher
# - wayback_archiver_enricher # - wayback_archiver_enricher
# - wacz_enricher # - wacz_archiver_enricher
# - pdq_hash_enricher # - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
formatter: html_formatter # defaults to mute_formatter formatter: html_formatter # defaults to mute_formatter
storages: storages:
- local_storage - local_storage
@@ -95,7 +96,7 @@ configurations:
secret: "wayback secret" secret: "wayback secret"
hash_enricher: hash_enricher:
algorithm: "SHA3-512" # can also be SHA-256 algorithm: "SHA3-512" # can also be SHA-256
wacz_enricher: wacz_archiver_enricher:
profile: secrets/profile.tar.gz profile: secrets/profile.tar.gz
local_storage: local_storage:
save_to: "./local_archive" save_to: "./local_archive"

View File

@@ -48,7 +48,7 @@ class TelegramArchiver(Archiver):
video = s.find("video") video = s.find("video")
if video is None: if video is None:
logger.warning("could not find video") logger.warning("could not find video")
image_tags = s.find_all(class_="js-message_photo") image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
image_urls = [] image_urls = []
for im in image_tags: for im in image_tags:

View File

@@ -6,6 +6,7 @@ from slugify import slugify
from . import Archiver from . import Archiver
from ..core import Metadata, Media from ..core import Metadata, Media
from ..utils import UrlUtil
class TwitterArchiver(Archiver): class TwitterArchiver(Archiver):
@@ -77,7 +78,7 @@ class TwitterArchiver(Archiver):
media.set("src", variant.url) media.set("src", variant.url)
mimetype = variant.contentType mimetype = variant.contentType
elif type(tweet_media) == Photo: elif type(tweet_media) == Photo:
media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig')) media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
mimetype = "image/jpeg" mimetype = "image/jpeg"
else: else:
logger.warning(f"Could not get media URL of {tweet_media}") logger.warning(f"Could not get media URL of {tweet_media}")
@@ -90,20 +91,22 @@ class TwitterArchiver(Archiver):
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata: def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
""" """
CURRENTLY STOPPED WORKING Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
""" """
return False
# https://stackoverflow.com/a/71867055/6196010
logger.debug(f"Trying twitter hack for {url=}") logger.debug(f"Trying twitter hack for {url=}")
result = Metadata() result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}" hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url) r = requests.get(hack_url)
if r.status_code != 200: return False if r.status_code != 200: return False
tweet = r.json() tweet = r.json()
urls = [] urls = []
for p in tweet["photos"]: for p in tweet.get("photos", []):
urls.append(p["url"]) urls.append(p["url"])
# 1 tweet has 1 video max # 1 tweet has 1 video max
@@ -113,14 +116,18 @@ class TwitterArchiver(Archiver):
logger.debug(f"Twitter hack got {urls=}") logger.debug(f"Twitter hack got {urls=}")
for u in urls: for i, u in enumerate(urls):
media = Media() media = Media(filename="")
media.set("src", u) media.set("src", u)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}', item) ext = ""
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
ext = mimetypes.guess_extension(mtype)
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
result.add_media(media) result.add_media(media)
result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")) result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result return result.success("twitter-hack")
def get_username_tweet_id(self, url): def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle # detect URLs that we definitely cannot handle

View File

@@ -109,6 +109,8 @@ class ArchivingOrchestrator:
# looks for Media in result.media and also result.media[x].properties (as list or dict values) # looks for Media in result.media and also result.media[x].properties (as list or dict values)
result.store() result.store()
#TODO: remove any duplicate media, if hash is available
# 6 - format and store formatted if needed # 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc # enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):

View File

@@ -67,7 +67,7 @@ class GsheetsDb(Database):
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")) batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())
batch_if_valid('hash', media.get("hash", "not-calculated")) if media: batch_if_valid('hash', media.get("hash", "not-calculated"))
# merge all pdq hashes into a single string, if present # merge all pdq hashes into a single string, if present
pdq_hashes = [] pdq_hashes = []

View File

@@ -3,6 +3,6 @@ from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackArchiverEnricher from .wayback_enricher import WaybackArchiverEnricher
from .hash_enricher import HashEnricher from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczEnricher from .wacz_enricher import WaczArchiverEnricher
from .whisper_enricher import WhisperEnricher from .whisper_enricher import WhisperEnricher
from .pdq_hash_enricher import PdqHashEnricher from .pdq_hash_enricher import PdqHashEnricher

View File

@@ -1,6 +1,7 @@
import traceback
import pdqhash import pdqhash
import numpy as np import numpy as np
from PIL import Image from PIL import Image, UnidentifiedImageError
from loguru import logger from loguru import logger
from . import Enricher from . import Enricher
@@ -33,10 +34,14 @@ class PdqHashEnricher(Enricher):
def calculate_pdq_hash(self, filename): def calculate_pdq_hash(self, filename):
# returns a hexadecimal string with the perceptual hash for the given filename # returns a hexadecimal string with the perceptual hash for the given filename
with Image.open(filename) as img: try:
# convert the image to RGB with Image.open(filename) as img:
image_rgb = np.array(img.convert("RGB")) # convert the image to RGB
# compute the 256-bit PDQ hash (we do not store the quality score) image_rgb = np.array(img.convert("RGB"))
hash_array, _ = pdqhash.compute(image_rgb) # compute the 256-bit PDQ hash (we do not store the quality score)
hash = "".join(str(b) for b in hash_array) hash_array, _ = pdqhash.compute(image_rgb)
return hex(int(hash, 2))[2:] hash = "".join(str(b) for b in hash_array)
return hex(int(hash, 2))[2:]
except UnidentifiedImageError as e:
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
return ""

View File

@@ -1,16 +1,23 @@
import mimetypes
import os, shutil, subprocess, uuid import os, shutil, subprocess, uuid
from zipfile import ZipFile
from loguru import logger from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from ..core import Media, Metadata, ArchivingContext from ..core import Media, Metadata, ArchivingContext
from . import Enricher from . import Enricher
from ..archivers import Archiver
from ..utils import UrlUtil from ..utils import UrlUtil
class WaczEnricher(Enricher): class WaczArchiverEnricher(Enricher, Archiver):
""" """
Submits the current URL to the webarchive and returns a job_id or completed archive Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)
it can become quite powerful for archiving private content.
When used as an archiver it will extract the media from the .WACZ archive so it can be enriched.
""" """
name = "wacz_enricher" name = "wacz_archiver_enricher"
def __init__(self, config: dict) -> None: def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called # without this STEP.__init__ is not called
@@ -20,12 +27,24 @@ class WaczEnricher(Enricher):
def configs() -> dict: def configs() -> dict:
return { return {
"profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."}, "profile": {"default": None, "help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)."},
"timeout": {"default": 90, "help": "timeout for WACZ generation in seconds"}, "timeout": {"default": 120, "help": "timeout for WACZ generation in seconds"},
"ignore_auth_wall": {"default": True, "help": "skip URL if it is behind authentication wall, set to False if you have browsertrix profile configured for private content."}, "extract_media": {"default": True, "help": "If enabled all the images/videos/audio present in the WACZ archive will be extracted into separate Media. The .wacz file will be kept untouched."}
} }
def download(self, item: Metadata) -> Metadata:
    """
    Archiver entry point: runs `enrich` on a fresh Metadata that `item` has been merged into.

    Returns the result of `.success("wacz")` on that fresh object when `enrich`
    returns a truthy value, otherwise None.
    """
    # this new Metadata object is required to avoid duplication
    archived = Metadata()
    archived.merge(item)
    if not self.enrich(archived):
        return None
    return archived.success("wacz")
def enrich(self, to_enrich: Metadata) -> bool: def enrich(self, to_enrich: Metadata) -> bool:
if to_enrich.get_media_by_id("browsertrix"):
logger.info(f"WACZ enricher had already been executed: {to_enrich.get_media_by_id('browsertrix')}")
return True
url = to_enrich.get_url() url = to_enrich.get_url()
logger.warning(f"ENRICHING WACZ for {url=}")
collection = str(uuid.uuid4())[0:8] collection = str(uuid.uuid4())[0:8]
browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir()) browsertrix_home = os.path.abspath(ArchivingContext.get_tmp_dir())
@@ -79,8 +98,6 @@ class WaczEnricher(Enricher):
logger.error(f"WACZ generation failed: {e}") logger.error(f"WACZ generation failed: {e}")
return False return False
if os.getenv('RUNNING_IN_DOCKER'): if os.getenv('RUNNING_IN_DOCKER'):
filename = os.path.join("collections", collection, f"{collection}.wacz") filename = os.path.join("collections", collection, f"{collection}.wacz")
else: else:
@@ -91,3 +108,55 @@ class WaczEnricher(Enricher):
return False return False
to_enrich.add_media(Media(filename), "browsertrix") to_enrich.add_media(Media(filename), "browsertrix")
if self.extract_media:
self.extract_media_from_wacz(to_enrich, filename)
return True
def extract_media_from_wacz(self, to_enrich: Metadata, wacz_filename: str) -> None:
    """
    Receives a .wacz archive, and extracts all relevant media from it, adding them to to_enrich.

    Steps:
      1. unzip the .wacz into an "unzipped" folder inside the ArchivingContext tmp dir;
      2. concatenate every `*.gz` chunk found in its `archive/` folder into one `merged.warc`;
      3. iterate the WARC `response` records, keep those whose URL passes
         `UrlUtil.is_relevant_url` and whose Content-Type mentions video/image/audio,
         write each body to `warc-file-<n><ext>` and attach it as a Media
         (with `src` set to the record URL) under the id `browsertrix-media-<n>`.
    """
    logger.info(f"WACZ extract_media flag is set, extracting media from {wacz_filename=}")

    # unzipping the .wacz
    tmp_dir = ArchivingContext.get_tmp_dir()
    unzipped_dir = os.path.join(tmp_dir, "unzipped")
    with ZipFile(wacz_filename, 'r') as z_obj:
        z_obj.extractall(path=unzipped_dir)

    # if warc is split into multiple gzip chunks, merge those
    warc_dir = os.path.join(unzipped_dir, "archive")
    warc_filename = os.path.join(tmp_dir, "merged.warc")
    with open(warc_filename, 'wb') as outfile:
        # sorted() keeps the chunk order deterministic
        for filename in sorted(os.listdir(warc_dir)):
            if filename.endswith('.gz'):
                chunk_file = os.path.join(warc_dir, filename)
                with open(chunk_file, 'rb') as infile:
                    shutil.copyfileobj(infile, outfile)

    # get media out of .warc
    counter = 0
    with open(warc_filename, 'rb') as warc_stream:
        for record in ArchiveIterator(warc_stream):
            # only include fetched resources
            if record.rec_type != 'response': continue
            record_url = record.rec_headers.get_header('WARC-Target-URI')
            if not UrlUtil.is_relevant_url(record_url):
                logger.debug(f"Skipping irrelevant URL {record_url} but it's still present in the WACZ.")
                continue

            # filter by media mimetypes
            content_type = record.http_headers.get("Content-Type")
            if not content_type: continue
            if not any(x in content_type for x in ["video", "image", "audio"]): continue

            # create local file and add media
            # guess_extension only knows bare mime types, so drop any parameters
            # ("image/jpeg; charset=UTF-8" -> "image/jpeg"); it returns None when
            # nothing matches, which must not end up as a literal "None" suffix
            mime_type = content_type.split(";", 1)[0].strip()
            ext = mimetypes.guess_extension(mime_type) or ""
            fn = os.path.join(tmp_dir, f"warc-file-{counter}{ext}")
            with open(fn, "wb") as outf: outf.write(record.raw_stream.read())
            m = Media(filename=fn)
            m.set("src", record_url)
            # TODO URLUTIL to ignore known-recurring media like favicons, profile pictures, etc.
            to_enrich.add_media(m, f"browsertrix-media-{counter}")
            counter += 1
    logger.info(f"WACZ extract_media finished, found {counter} relevant media file(s)")

View File

@@ -28,6 +28,7 @@ class WaybackArchiverEnricher(Enricher, Archiver):
} }
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
# this new Metadata object is required to avoid duplication
result = Metadata() result = Metadata()
result.merge(item) result.merge(item)
if self.enrich(result): if self.enrich(result):

View File

@@ -20,7 +20,6 @@ def expand_url(url):
logger.error(f'Failed to expand url {url}') logger.error(f'Failed to expand url {url}')
return url return url
def getattr_or(o: object, prop: str, default=None): def getattr_or(o: object, prop: str, default=None):
try: try:
res = getattr(o, prop) res = getattr(o, prop)

View File

@@ -1,14 +1,16 @@
import re import re
from urllib.parse import urlparse, urlunparse
class UrlUtil: class UrlUtil:
telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)") telegram_private = re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)")
is_istagram = re.compile(r"https:\/\/www\.instagram\.com") is_istagram = re.compile(r"https:\/\/www\.instagram\.com")
@staticmethod @staticmethod
def clean(url): return url def clean(url: str) -> str: return url
@staticmethod @staticmethod
def is_auth_wall(url): def is_auth_wall(url: str) -> bool:
""" """
checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work checks if URL is behind an authentication wall meaning steps like wayback, wacz, ... may not work
""" """
@@ -17,3 +19,28 @@ class UrlUtil:
return False return False
@staticmethod
def remove_get_parameters(url: str) -> str:
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
# useful for mimetypes to work
parsed_url = urlparse(url)
new_url = urlunparse(parsed_url._replace(query=''))
return new_url
@staticmethod
def is_relevant_url(url: str) -> bool:
    """
    Detect if a detected media URL is recurring and therefore irrelevant to a specific archive. Useful, for example, for the enumeration of the media files in WARC files which include profile pictures, favicons, etc.

    Returns False for URLs containing "favicon", URLs that (ignoring the query string)
    end in .ico or .svg, and twitter profile pictures; True otherwise.
    """
    # the query string is stripped once and reused for the extension checks
    clean_url = UrlUtil.remove_get_parameters(url)

    # favicons
    if "favicon" in url: return False
    # ignore icons and SVGs
    if clean_url.endswith((".ico", ".svg")): return False
    # twitter profile pictures
    if "twimg.com/profile_images" in url: return False
    return True

View File

@@ -1,9 +1,9 @@
_MAJOR = "0" _MAJOR = "0"
_MINOR = "5" _MINOR = "6"
# On main and in a nightly release the patch should be one ahead of the last # On main and in a nightly release the patch should be one ahead of the last
# released build. # released build.
_PATCH = "25" _PATCH = "0"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See # This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics. # https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = "" _SUFFIX = ""