mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Compare commits
16 Commits
v0.11.1
...
youtube-co
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
82c00d491d | ||
|
|
e49550163f | ||
|
|
e6f5981afc | ||
|
|
c62bf1a34d | ||
|
|
b166d57e61 | ||
|
|
11c3288267 | ||
|
|
004143a58a | ||
|
|
686f0027c4 | ||
|
|
b03cf32c73 | ||
|
|
dc9e64397e | ||
|
|
c7bc5e2988 | ||
|
|
1e375bd740 | ||
|
|
f8824691dd | ||
|
|
012cc36609 | ||
|
|
7cfe1e39cc | ||
|
|
cf8691bad7 |
4
Pipfile
4
Pipfile
@@ -30,10 +30,10 @@ tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "*"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = "*"
|
||||
yt-dlp = "2024.09.27"
|
||||
vk-url-scraper = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
numpy = "*"
|
||||
numpy = "1.26.4"
|
||||
warcio = "*"
|
||||
jsonlines = "*"
|
||||
pysubs2 = "*"
|
||||
|
||||
1974
Pipfile.lock
generated
1974
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
||||
|
||||
<details><summary><code>Python package instructions</code></summary>
|
||||
|
||||
1. make sure you have python 3.8 or higher installed
|
||||
1. make sure you have python 3.10 or higher installed
|
||||
2. install the package `pip/pipenv/conda install auto-archiver`
|
||||
3. test it's installed with `auto-archiver --help`
|
||||
4. run it with your orchestration file and pass any flags you want on the command line, e.g. `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/` folder, which we advise
|
||||
@@ -108,7 +108,7 @@ configurations:
|
||||
# ... configurations for the other steps here ...
|
||||
```
|
||||
|
||||
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
|
||||
All the `configurations` in the `orchestration.yaml` file (you can name it differently, but then you need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overridden from the command line; for example, if you are using the `cli_feeder` to archive from the command line and want to provide the URLs, you should do:
|
||||
|
||||
|
||||
@@ -16,8 +16,13 @@ steps:
|
||||
# - wacz_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - meta_enricher
|
||||
# - metadata_enricher
|
||||
# - screenshot_enricher
|
||||
# - pdq_hash_enricher
|
||||
# - ssl_enricher
|
||||
# - timestamping_enricher
|
||||
# - whisper_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_archiver_enricher
|
||||
@@ -89,6 +94,14 @@ configurations:
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
youtubedl_archiver:
|
||||
subtitles: true
|
||||
# use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
|
||||
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
|
||||
# cookie_file: "secrets/youtube_cookies.txt"
|
||||
# cookies_from_browser: firefox
|
||||
# proxy: socks5://proxy-user:password@proxy-ip:port
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
|
||||
@@ -27,7 +27,7 @@ package_dir=
|
||||
=src
|
||||
packages=find:
|
||||
find_packages=true
|
||||
python_requires = >=3.8
|
||||
python_requires = >=3.10
|
||||
|
||||
[options.package_data]
|
||||
* = *.html
|
||||
|
||||
@@ -54,8 +54,9 @@ class InstagramTbotArchiver(Archiver):
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info(f"CLEANUP {self.name}.")
|
||||
if os.path.exists(self.session_file):
|
||||
os.remove(self.session_file)
|
||||
session_file_name = self.session_file + ".session"
|
||||
if os.path.exists(session_file_name):
|
||||
os.remove(session_file_name)
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
url = item.get_url()
|
||||
|
||||
@@ -101,8 +101,9 @@ class TelethonArchiver(Archiver):
|
||||
|
||||
def cleanup(self) -> None:
|
||||
logger.info(f"CLEANUP {self.name}.")
|
||||
if os.path.exists(self.session_file):
|
||||
os.remove(self.session_file)
|
||||
session_file_name = self.session_file + ".session"
|
||||
if os.path.exists(session_file_name):
|
||||
os.remove(session_file_name)
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
"""
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import re, requests, mimetypes, json
|
||||
from typing import Union
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||
from yt_dlp import YoutubeDL
|
||||
from yt_dlp.extractor.twitter import TwitterIE
|
||||
from slugify import slugify
|
||||
|
||||
from . import Archiver
|
||||
@@ -29,7 +32,7 @@ class TwitterArchiver(Archiver):
|
||||
# expand URL if t.co and clean tracker GET params
|
||||
if 'https://t.co/' in url:
|
||||
try:
|
||||
r = requests.get(url)
|
||||
r = requests.get(url, timeout=30)
|
||||
logger.debug(f'Expanded url {url} to {r.url}')
|
||||
url = r.url
|
||||
except:
|
||||
@@ -43,19 +46,31 @@ class TwitterArchiver(Archiver):
|
||||
can handle private/public channels
|
||||
"""
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
username, tweet_id = self.get_username_tweet_id(url)
|
||||
if not username: return False
|
||||
|
||||
result = Metadata()
|
||||
strategies = [self.download_yt_dlp, self.download_snscrape, self.download_syndication]
|
||||
for strategy in strategies:
|
||||
logger.debug(f"Trying {strategy.__name__} for {url=}")
|
||||
try:
|
||||
result = strategy(item, url, tweet_id)
|
||||
if result: return result
|
||||
except Exception as ex:
|
||||
logger.error(f"Failed to download {url} with {strategy.__name__}: {type(ex).__name__} occurred. args: {ex.args}")
|
||||
|
||||
logger.warning(f"No free strategy worked for {url}")
|
||||
return False
|
||||
|
||||
|
||||
def download_snscrape(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
|
||||
scr = TwitterTweetScraper(tweet_id)
|
||||
try:
|
||||
tweet = next(scr.get_items())
|
||||
except Exception as ex:
|
||||
logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
|
||||
return self.download_alternative(item, url, tweet_id)
|
||||
|
||||
logger.warning(f"SNSCRAPE FAILED, can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
|
||||
return False
|
||||
|
||||
result = Metadata()
|
||||
result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
|
||||
if tweet.media is None:
|
||||
logger.debug(f'No media found, archiving tweet text only')
|
||||
@@ -85,7 +100,7 @@ class TwitterArchiver(Archiver):
|
||||
|
||||
return result.success("twitter-snscrape")
|
||||
|
||||
def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
|
||||
def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
|
||||
"""
|
||||
Hack alternative working again.
|
||||
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
|
||||
@@ -93,12 +108,13 @@ class TwitterArchiver(Archiver):
|
||||
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
|
||||
"""
|
||||
|
||||
logger.debug(f"Trying twitter hack for {url=}")
|
||||
result = Metadata()
|
||||
|
||||
hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
|
||||
r = requests.get(hack_url)
|
||||
if r.status_code != 200: return False
|
||||
if r.status_code != 200 or r.json()=={}:
|
||||
logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
|
||||
return False
|
||||
|
||||
result = Metadata()
|
||||
tweet = r.json()
|
||||
|
||||
urls = []
|
||||
@@ -108,7 +124,7 @@ class TwitterArchiver(Archiver):
|
||||
# 1 tweet has 1 video max
|
||||
if "video" in tweet:
|
||||
v = tweet["video"]
|
||||
urls.append(self.choose_variant(v.get("variants", [])))
|
||||
urls.append(self.choose_variant(v.get("variants", []))['url'])
|
||||
|
||||
logger.debug(f"Twitter hack got {urls=}")
|
||||
|
||||
@@ -124,7 +140,39 @@ class TwitterArchiver(Archiver):
|
||||
result.add_media(media)
|
||||
|
||||
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
|
||||
return result.success("twitter-hack")
|
||||
return result.success("twitter-syndication")
|
||||
|
||||
def download_yt_dlp(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
|
||||
downloader = YoutubeDL()
|
||||
tie = TwitterIE(downloader)
|
||||
tweet = tie._extract_status(tweet_id)
|
||||
result = Metadata()
|
||||
result\
|
||||
.set_title(tweet.get('full_text', ''))\
|
||||
.set_content(json.dumps(tweet, ensure_ascii=False))\
|
||||
.set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
|
||||
if not tweet.get("entities", {}).get("media"):
|
||||
logger.debug('No media found, archiving tweet text only')
|
||||
result.status = "twitter-ytdl"
|
||||
return result
|
||||
for i, tw_media in enumerate(tweet["entities"]["media"]):
|
||||
media = Media(filename="")
|
||||
mimetype = ""
|
||||
if tw_media["type"] == "photo":
|
||||
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
|
||||
mimetype = "image/jpeg"
|
||||
elif tw_media["type"] == "video":
|
||||
variant = self.choose_variant(tw_media['video_info']['variants'])
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
elif tw_media["type"] == "animated_gif":
|
||||
variant = tw_media['video_info']['variants'][0]
|
||||
media.set("src", variant['url'])
|
||||
mimetype = variant['content_type']
|
||||
ext = mimetypes.guess_extension(mimetype)
|
||||
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
|
||||
result.add_media(media)
|
||||
return result.success("twitter-ytdl")
|
||||
|
||||
def get_username_tweet_id(self, url):
|
||||
# detect URLs that we definitely cannot handle
|
||||
@@ -140,13 +188,13 @@ class TwitterArchiver(Archiver):
|
||||
# choosing the highest quality possible
|
||||
variant, width, height = None, 0, 0
|
||||
for var in variants:
|
||||
if var.get("type", "") == "video/mp4":
|
||||
width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
|
||||
if var.get("content_type", "") == "video/mp4":
|
||||
width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
|
||||
if width_height:
|
||||
w, h = int(width_height[1]), int(width_height[2])
|
||||
if w > width or h > height:
|
||||
width, height = w, h
|
||||
variant = var.get("src", variant)
|
||||
variant = var
|
||||
else:
|
||||
variant = var.get("src") if not variant else variant
|
||||
variant = var if not variant else variant
|
||||
return variant
|
||||
|
||||
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
|
||||
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
|
||||
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
|
||||
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
|
||||
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
|
||||
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
if item.netloc in ['youtube.com', 'www.youtube.com']:
|
||||
if self.cookies_from_browser:
|
||||
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
|
||||
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
|
||||
elif self.cookie_file:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file}')
|
||||
ydl_options['cookiefile'] = self.cookie_file
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
try:
|
||||
@@ -98,11 +109,12 @@ class YoutubeDLArchiver(Archiver):
|
||||
result.set("comments", [{
|
||||
"text": c["text"],
|
||||
"author": c["author"],
|
||||
"timestamp": datetime.datetime.utcfromtimestamp(c.get("timestamp")).replace(tzinfo=datetime.timezone.utc)
|
||||
"timestamp": datetime.datetime.fromtimestamp(c.get("timestamp"), tz = datetime.timezone.utc)
|
||||
} for c in info.get("comments", [])])
|
||||
|
||||
if (timestamp := info.get("timestamp")):
|
||||
timestamp = datetime.datetime.utcfromtimestamp(timestamp).replace(tzinfo=datetime.timezone.utc).isoformat()
|
||||
#TODO: fix deprecated timestamp,
|
||||
timestamp = datetime.datetime.fromtimestamp(timestamp, tz = datetime.timezone.utc).isoformat()
|
||||
result.set_timestamp(timestamp)
|
||||
if (upload_date := info.get("upload_date")):
|
||||
upload_date = datetime.datetime.strptime(upload_date, '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
|
||||
|
||||
@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
my_env = os.environ.copy()
|
||||
if self.socks_proxy_host and self.socks_proxy_port:
|
||||
if self.proxy_server:
|
||||
logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
|
||||
my_env["PROXY_SERVER"] = self.proxy_server
|
||||
elif self.socks_proxy_host and self.socks_proxy_port:
|
||||
logger.debug("Using SOCKS proxy for browsertrix-crawler")
|
||||
my_env["SOCKS_HOST"] = self.socks_proxy_host
|
||||
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "11"
|
||||
_MINOR = "13"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "1"
|
||||
|
||||
Reference in New Issue
Block a user