Compare commits

..

2 Commits

Author SHA1 Message Date
msramalho
a455728673 version bump 2024-04-16 12:44:42 +01:00
msramalho
8d4357a22c closes #135 2024-04-16 12:44:32 +01:00
9 changed files with 1034 additions and 1033 deletions

View File

@@ -30,10 +30,10 @@ tqdm = "*"
jinja2 = "*"
cryptography = "*"
dataclasses-json = "*"
yt-dlp = "2024.09.27"
yt-dlp = "*"
vk-url-scraper = "*"
requests = {extras = ["socks"], version = "*"}
numpy = "1.26.4"
numpy = "*"
warcio = "*"
jsonlines = "*"
pysubs2 = "*"

1972
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.10 or higher installed
1. make sure you have python 3.8 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
# ... configurations for the other steps here ...
```
To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

View File

@@ -16,13 +16,8 @@ steps:
# - wacz_archiver_enricher
enrichers:
- hash_enricher
# - meta_enricher
# - metadata_enricher
# - screenshot_enricher
# - pdq_hash_enricher
# - ssl_enricher
# - timestamping_enricher
# - whisper_enricher
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_archiver_enricher
@@ -94,14 +89,6 @@ configurations:
password: "vk pass"
session_file: "secrets/vk_config.v2.json"
youtubedl_archiver:
subtitles: true
# use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
# cookie_file: "secrets/youtube_cookies.txt"
# cookies_from_browser: firefox
# proxy: socks5://proxy-user:password@proxy-ip:port
screenshot_enricher:
width: 1280
height: 2300

View File

@@ -27,7 +27,7 @@ package_dir=
=src
packages=find:
find_packages=true
python_requires = >=3.10
python_requires = >=3.8
[options.package_data]
* = *.html

View File

@@ -1,5 +1,4 @@
import re, requests, mimetypes, json
from typing import Union
from datetime import datetime
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
@@ -32,7 +31,7 @@ class TwitterArchiver(Archiver):
# expand URL if t.co and clean tracker GET params
if 'https://t.co/' in url:
try:
r = requests.get(url, timeout=30)
r = requests.get(url)
logger.debug(f'Expanded url {url} to {r.url}')
url = r.url
except:
@@ -46,31 +45,19 @@ class TwitterArchiver(Archiver):
can handle private/public channels
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
username, tweet_id = self.get_username_tweet_id(url)
if not username: return False
strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
for strategy in strategies:
logger.debug(f"Trying {strategy.__name__} for {url=}")
try:
result = strategy(item, url, tweet_id)
if result: return result
except Exception as ex:
logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
logger.warning(f"No free strategy worked for {url}")
return False
result = Metadata()
def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
scr = TwitterTweetScraper(tweet_id)
try:
tweet = next(scr.get_items())
except Exception as ex:
logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return False
result = Metadata()
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
return self.download_alternative(item, url, tweet_id)
result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
if tweet.media is None:
logger.debug(f'No media found, archiving tweet text only')
@@ -100,7 +87,7 @@ class TwitterArchiver(Archiver):
return result.success("twitter-snscrape")
def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
"""
Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
@@ -108,13 +95,14 @@ class TwitterArchiver(Archiver):
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
"""
logger.debug(f"Trying twitter hack for {url=}")
result = Metadata()
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url)
if r.status_code != 200 or r.json()=={}:
logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
return False
result = Metadata()
logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
return self.download_ytdl(item, url, tweet_id)
tweet = r.json()
urls = []
@@ -140,9 +128,9 @@ class TwitterArchiver(Archiver):
result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-syndication")
def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
return result.success("twitter-hack")
def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
downloader = YoutubeDL()
tie = TwitterIE(downloader)
tweet = tie._extract_status(tweet_id)
@@ -153,7 +141,6 @@ class TwitterArchiver(Archiver):
.set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
@@ -173,6 +160,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter-ytdl")
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle

View File

@@ -30,8 +30,6 @@ class YoutubeDLArchiver(Archiver):
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
def download(self, item: Metadata) -> Metadata:
@@ -40,17 +38,8 @@ class YoutubeDLArchiver(Archiver):
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
try:
@@ -109,12 +98,11 @@ class YoutubeDLArchiver(Archiver):
result.set("comments", [{
"text": c["text"],
"author": c["author"],
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
"timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
} for c in info.get("comments", [])])
if (timestamp := info.get("timestamp")):
#TODO: fix deprecated timestamp,
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
result.set_timestamp(timestamp)
if (upload_date := info.get("upload_date")):
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)

View File

@@ -34,7 +34,6 @@ class WaczArchiverEnricher(Enricher, Archiver):
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
}
def setup(self) -> None:
@@ -114,10 +113,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
my_env = os.environ.copy()
if self.proxy_server:
logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
my_env["PROXY_SERVER"] = self.proxy_server
elif self.socks_proxy_host and self.socks_proxy_port:
if self.socks_proxy_host and self.socks_proxy_port:
logger.debug("Using SOCKS proxy for browsertrix-crawler")
my_env["SOCKS_HOST"] = self.socks_proxy_host
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "13"
_MINOR = "11"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
_PATCH = "3"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""