mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Merge branch 'main' into feat/yt-dlp-pots
# Conflicts: # src/auto_archiver/modules/generic_extractor/__manifest__.py # tests/test_modules.py
This commit is contained in:
@@ -71,7 +71,16 @@ class BaseModule(ABC):
|
||||
:param site: the domain of the site to get authentication information for
|
||||
:param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
|
||||
|
||||
:returns: authdict dict of login information for the given site
|
||||
:returns: authdict dict -> {
|
||||
"username": str,
|
||||
"password": str,
|
||||
"api_key": str,
|
||||
"api_secret": str,
|
||||
"cookie": str,
|
||||
"cookies_file": str,
|
||||
"cookies_from_browser": str,
|
||||
"cookies_jar": CookieJar
|
||||
}
|
||||
|
||||
**Global options:**\n
|
||||
* cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n
|
||||
@@ -85,6 +94,7 @@ class BaseModule(ABC):
|
||||
* cookie: str - a cookie string to use for login (specific to this site)\n
|
||||
* cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
|
||||
* cookies_from_browser: str - the name of the browser to extract cookies from (specific to this site)\n
|
||||
|
||||
"""
|
||||
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
|
||||
# for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
|
||||
|
||||
@@ -5,6 +5,7 @@ by handling user configuration, validating the steps properties, and implementin
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import subprocess
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, TYPE_CHECKING, Type
|
||||
@@ -17,7 +18,7 @@ import os
|
||||
from os.path import join
|
||||
from loguru import logger
|
||||
import auto_archiver
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
|
||||
from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE, SetupError
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_module import BaseModule
|
||||
@@ -85,7 +86,11 @@ class ModuleFactory:
|
||||
if not available:
|
||||
message = f"Module '{module_name}' not found. Are you sure it's installed/exists?"
|
||||
if "archiver" in module_name:
|
||||
message += f" Did you mean {module_name.replace('archiver', 'extractor')}?"
|
||||
message += f" Did you mean '{module_name.replace('archiver', 'extractor')}'?"
|
||||
elif "gsheet" in module_name:
|
||||
message += " Did you mean 'gsheet_feeder_db'?"
|
||||
elif "atlos" in module_name:
|
||||
message += " Did you mean 'atlos_feeder_db_storage'?"
|
||||
raise IndexError(message)
|
||||
return available[0]
|
||||
|
||||
@@ -216,9 +221,9 @@ class LazyBaseModule:
|
||||
if not check(dep):
|
||||
logger.error(
|
||||
f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the README for more information."
|
||||
Have you installed the required dependencies for the '{self.name}' module? See the documentation for more information."
|
||||
)
|
||||
exit(1)
|
||||
raise SetupError()
|
||||
|
||||
def check_python_dep(dep):
|
||||
# first check if it's a module:
|
||||
@@ -237,8 +242,22 @@ class LazyBaseModule:
|
||||
|
||||
return find_spec(dep)
|
||||
|
||||
def check_bin_dep(dep):
    """Report whether the external binary ``dep`` is usable.

    :param dep: name of the executable to look for on the PATH
    :returns: a truthy value when the dependency is available, a falsy one
        otherwise. For ordinary binaries this is the result of
        ``shutil.which`` (the resolved path, or ``None``). For ``docker`` it
        is ``True``/``False`` depending on whether ``docker ps -q`` succeeds.
    """
    if dep == "docker" and os.environ.get("RUNNING_IN_DOCKER"):
        # this is only for the WACZ enricher, which requires docker
        # if we're already running in docker then we don't need docker
        return True

    dep_exists = shutil.which(dep)
    if dep != "docker" or not dep_exists:
        return dep_exists

    # check if docker daemon is running; the probe's output is discarded so
    # that container ids / daemon errors do not leak into the console
    try:
        probe = subprocess.run(
            ["docker", "ps", "-q"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # the binary was found but could not be launched: treat as unavailable
        return False
    return probe.returncode == 0
|
||||
|
||||
check_deps(self.dependencies.get("python", []), check_python_dep)
|
||||
check_deps(self.dependencies.get("bin", []), lambda dep: shutil.which(dep))
|
||||
check_deps(self.dependencies.get("bin", []), check_bin_dep)
|
||||
|
||||
logger.debug(f"Loading module '{self.display_name}'...")
|
||||
|
||||
|
||||
@@ -373,9 +373,17 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
|
||||
if module in invalid_modules:
|
||||
continue
|
||||
|
||||
# check to make sure that we're trying to load it as the correct type - i.e. make sure the user hasn't put it under the wrong 'step'
|
||||
lazy_module: LazyBaseModule = self.module_factory.get_module_lazy(module)
|
||||
if module_type not in lazy_module.type:
|
||||
types = ",".join(f"'{t}'" for t in lazy_module.type)
|
||||
raise SetupError(
|
||||
f"Configuration Error: Module '{module}' is not a {module_type}, but has the types: {types}. Please check you set this module up under the right step in your orchestration file."
|
||||
)
|
||||
|
||||
loaded_module = None
|
||||
try:
|
||||
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
|
||||
loaded_module: BaseModule = lazy_module.load(self.config)
|
||||
except (KeyboardInterrupt, Exception) as e:
|
||||
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
|
||||
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
|
||||
|
||||
@@ -74,10 +74,6 @@ If you are having issues with the extractor, you can review the version of `yt-d
|
||||
"default": "inf",
|
||||
"help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit.",
|
||||
},
|
||||
"pot_provider": {
|
||||
"default": "bgutils",
|
||||
"help": "The Proof of origin provider method.",
|
||||
},
|
||||
"extractor_args": {
|
||||
"default": {},
|
||||
"help": "Additional arguments to pass to the yt-dlp extractor. See https://github.com/yt-dlp/yt-dlp/blob/master/README.md#extractor-arguments.",
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import re
|
||||
import mimetypes
|
||||
import json
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
@@ -32,6 +31,9 @@ class Twitter(GenericDropin):
|
||||
twid = ie_instance._match_valid_url(url).group("id")
|
||||
return ie_instance._extract_status(twid=twid)
|
||||
|
||||
def keys_to_clean(self, video_data, info_extractor):
    """Return the fixed list of tweet keys this dropin marks for cleaning.

    Neither argument is consulted; a new list with the same five names is
    returned on every call.
    NOTE(review): presumably the caller removes these keys from the
    extracted data - confirm against the generic extractor.
    """
    unwanted_keys = [
        "user",
        "created_at",
        "entities",
        "favorited",
        "translator_type",
    ]
    return unwanted_keys
|
||||
|
||||
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
|
||||
result = Metadata()
|
||||
try:
|
||||
@@ -42,9 +44,11 @@ class Twitter(GenericDropin):
|
||||
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
|
||||
return False
|
||||
|
||||
result.set_title(tweet.get("full_text", "")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(
|
||||
timestamp
|
||||
)
|
||||
full_text = tweet.pop("full_text", "")
|
||||
author = tweet["user"].get("name", "")
|
||||
result.set("author", author).set_url(url)
|
||||
|
||||
result.set_title(f"{author} - {full_text}").set_content(full_text).set_timestamp(timestamp)
|
||||
if not tweet.get("entities", {}).get("media"):
|
||||
logger.debug("No media found, archiving tweet text only")
|
||||
result.status = "twitter-ytdl"
|
||||
|
||||
@@ -70,10 +70,14 @@
|
||||
- Skips redundant updates for empty or invalid data fields.
|
||||
|
||||
### Setup
|
||||
- Requires a Google Service Account JSON file for authentication, which should be stored in `secrets/gsheets_service_account.json`.
|
||||
To set up a service account, follow the instructions [here](https://gspread.readthedocs.io/en/latest/oauth2.html).
|
||||
- Define the `sheet` or `sheet_id` configuration to specify the sheet to archive.
|
||||
- Customize the column names in your Google sheet using the `columns` configuration.
|
||||
- The Google Sheet can be used soley as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
|
||||
1. Requires a Google Service Account JSON file for authentication.
|
||||
To set up a service account, follow the instructions in the [how to](https://auto-archiver.readthedocs.io/en/latest/how_to/gsheets_setup.html),
|
||||
or use the script:
|
||||
```
|
||||
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/bellingcat/auto-archiver/refs/heads/main/scripts/generate_google_services.sh)"
|
||||
```
|
||||
2. Create a Google sheet with the required column(s) and then define the `sheet` or `sheet_id` configuration to specify this sheet.
|
||||
3. Customize the column names in your Google sheet using the `columns` configuration.
|
||||
4. The Google Sheet can be used solely as a feeder or as a feeder and database, but note you can't currently feed into the database from an alternate feeder.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
|
||||
# TODO: links to stories
|
||||
|
||||
def setup(self) -> None:
|
||||
logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
|
||||
logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
|
||||
|
||||
self.insta = instaloader.Instaloader(
|
||||
download_geotags=True,
|
||||
download_comments=True,
|
||||
|
||||
@@ -19,12 +19,21 @@ class ScreenshotEnricher(Enricher):
|
||||
def enrich(self, to_enrich: Metadata) -> None:
|
||||
url = to_enrich.get_url()
|
||||
|
||||
if UrlUtil.is_auth_wall(url):
|
||||
logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
|
||||
return
|
||||
|
||||
logger.debug(f"Enriching screenshot for {url=}")
|
||||
auth = self.auth_for_site(url)
|
||||
|
||||
# screenshot enricher only supports cookie-type auth (selenium)
|
||||
has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
|
||||
|
||||
if UrlUtil.is_auth_wall(url) and not has_valid_auth:
|
||||
logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
|
||||
if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
|
||||
logger.warning(
|
||||
f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
|
||||
Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
|
||||
)
|
||||
return
|
||||
|
||||
with self.webdriver_factory(
|
||||
self.width,
|
||||
self.height,
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"configs": {
|
||||
"profile": {
|
||||
"default": None,
|
||||
"help": "browsertrix-profile (for profile generation see https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles).",
|
||||
"help": "browsertrix-profile (for profile generation see https://crawler.docs.browsertrix.com/user-guide/browser-profiles/).",
|
||||
},
|
||||
"docker_commands": {"default": None, "help": "if a custom docker invocation is needed"},
|
||||
"timeout": {"default": 120, "help": "timeout for WACZ generation in seconds", "type": "int"},
|
||||
@@ -40,14 +40,27 @@
|
||||
Creates .WACZ archives of web pages using the `browsertrix-crawler` tool, with options for media extraction and screenshot saving.
|
||||
[Browsertrix-crawler](https://crawler.docs.browsertrix.com/user-guide/) is a headless browser-based crawler that archives web pages in WACZ format.
|
||||
|
||||
### Features
|
||||
## Setup
|
||||
|
||||
**Docker**
|
||||
If you are using the Docker file to run Auto Archiver (recommended), then everything is set up and you can use WACZ out of the box!
|
||||
Otherwise, if you are using a local install of Auto Archiver (e.g. pip or dev install), then you will need to install Docker and run
|
||||
the docker daemon to be able to run the `browsertrix-crawler` tool.
|
||||
|
||||
**Browsertrix Profiles**
|
||||
A browsertrix profile is a custom browser profile (login information, browser extensions, etc.) that can be used to archive private or dynamic content.
|
||||
You can run the WACZ Enricher without a profile, but for more resilient archiving, it is recommended to create a profile. See the [Browsertrix documentation](https://crawler.docs.browsertrix.com/user-guide/browser-profiles/)
|
||||
for more information.
|
||||
|
||||
**Docker in Docker**
|
||||
If you are running Auto Archiver within a Docker container, you will need to enable Docker in Docker to run the `browsertrix-crawler` tool.
|
||||
This can be done by setting the `WACZ_ENABLE_DOCKER` environment variable to `1`.
|
||||
|
||||
## Features
|
||||
- Archives web pages into .WACZ format using Docker or direct invocation of `browsertrix-crawler`.
|
||||
- Supports custom profiles for archiving private or dynamic content.
|
||||
- Extracts media (images, videos, audio) and screenshots from the archive, optionally adding them to the enrichment pipeline.
|
||||
- Generates metadata from the archived page's content and structure (e.g., titles, text).
|
||||
|
||||
### Notes
|
||||
- Requires Docker for running `browsertrix-crawler`.
|
||||
- Configurable via parameters for timeout, media extraction, screenshots, and proxy settings.
|
||||
""",
|
||||
}
|
||||
|
||||
@@ -24,7 +24,8 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
self.use_docker = os.environ.get("WACZ_ENABLE_DOCKER") or not os.environ.get("RUNNING_IN_DOCKER")
|
||||
self.docker_in_docker = os.environ.get("WACZ_ENABLE_DOCKER") and os.environ.get("RUNNING_IN_DOCKER")
|
||||
|
||||
self.cwd_dind = f"/crawls/crawls{random_str(8)}"
|
||||
self.crawl_id = random_str(8)
|
||||
self.cwd_dind = f"/crawls/crawls{self.crawl_id}"
|
||||
self.browsertrix_home_host = os.environ.get("BROWSERTRIX_HOME_HOST")
|
||||
self.browsertrix_home_container = os.environ.get("BROWSERTRIX_HOME_CONTAINER") or self.browsertrix_home_host
|
||||
# create crawls folder if not exists, so it can be safely removed in cleanup
|
||||
@@ -50,7 +51,7 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
|
||||
url = to_enrich.get_url()
|
||||
|
||||
collection = random_str(8)
|
||||
collection = self.crawl_id
|
||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(self.tmp_dir)
|
||||
browsertrix_home_container = self.browsertrix_home_container or browsertrix_home_host
|
||||
|
||||
@@ -102,10 +103,11 @@ class WaczExtractorEnricher(Enricher, Extractor):
|
||||
] + cmd
|
||||
|
||||
if self.profile:
|
||||
profile_fn = os.path.join(browsertrix_home_container, "profile.tar.gz")
|
||||
profile_file = f"profile-{self.crawl_id}.tar.gz"
|
||||
profile_fn = os.path.join(browsertrix_home_container, profile_file)
|
||||
logger.debug(f"copying {self.profile} to {profile_fn}")
|
||||
shutil.copyfile(self.profile, profile_fn)
|
||||
cmd.extend(["--profile", os.path.join("/crawls", "profile.tar.gz")])
|
||||
cmd.extend(["--profile", os.path.join("/crawls", profile_file)])
|
||||
|
||||
else:
|
||||
logger.debug(f"generating WACZ without Docker for {url=}")
|
||||
|
||||
@@ -4,8 +4,8 @@ from ipaddress import ip_address
|
||||
|
||||
|
||||
AUTHWALL_URLS = [
|
||||
re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
||||
re.compile(r"https:\/\/www\.instagram\.com"), # instagram
|
||||
re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"), # telegram private channels
|
||||
re.compile(r"https?:\/\/(www\.)?instagram\.com"), # instagram
|
||||
]
|
||||
|
||||
|
||||
@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
|
||||
"""
|
||||
clean_url = remove_get_parameters(url)
|
||||
|
||||
# favicons
|
||||
if "favicon" in url:
|
||||
return False
|
||||
# ignore icons
|
||||
if clean_url.endswith(".ico"):
|
||||
return False
|
||||
# ignore SVGs
|
||||
if remove_get_parameters(url).endswith(".svg"):
|
||||
return False
|
||||
IRRELEVANT_URLS = [
|
||||
# favicons
|
||||
("favicon",),
|
||||
# twitter profile pictures
|
||||
("twimg.com/profile_images",),
|
||||
("twimg.com", "default_profile_images"),
|
||||
# instagram profile pictures
|
||||
("https://scontent.cdninstagram.com/", "150x150"),
|
||||
# instagram recurring images
|
||||
("https://static.cdninstagram.com/rsrc.php/",),
|
||||
# telegram
|
||||
("https://telegram.org/img/emoji/",),
|
||||
# youtube
|
||||
("https://www.youtube.com/s/gaming/emoji/",),
|
||||
("https://yt3.ggpht.com", "default-user="),
|
||||
("https://www.youtube.com/s/search/audio/",),
|
||||
# ok
|
||||
("https://ok.ru/res/i/",),
|
||||
("https://vk.com/emoji/",),
|
||||
("vk.com/images/",),
|
||||
("vk.com/images/reaction/",),
|
||||
# wikipedia
|
||||
("wikipedia.org/static",),
|
||||
]
|
||||
|
||||
# twitter profile pictures
|
||||
if "twimg.com/profile_images" in url:
|
||||
return False
|
||||
if "twimg.com" in url and "/default_profile_images" in url:
|
||||
return False
|
||||
IRRELEVANT_ENDS_WITH = [
|
||||
".svg", # ignore SVGs
|
||||
".ico", # ignore icons
|
||||
]
|
||||
|
||||
# instagram profile pictures
|
||||
if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
|
||||
return False
|
||||
# instagram recurring images
|
||||
if "https://static.cdninstagram.com/rsrc.php/" in url:
|
||||
return False
|
||||
for end in IRRELEVANT_ENDS_WITH:
|
||||
if clean_url.endswith(end):
|
||||
return False
|
||||
|
||||
# telegram
|
||||
if "https://telegram.org/img/emoji/" in url:
|
||||
return False
|
||||
|
||||
# youtube
|
||||
if "https://www.youtube.com/s/gaming/emoji/" in url:
|
||||
return False
|
||||
if "https://yt3.ggpht.com" in url and "default-user=" in url:
|
||||
return False
|
||||
if "https://www.youtube.com/s/search/audio/" in url:
|
||||
return False
|
||||
|
||||
# ok
|
||||
if " https://ok.ru/res/i/" in url:
|
||||
return False
|
||||
|
||||
# vk
|
||||
if "https://vk.com/emoji/" in url:
|
||||
return False
|
||||
if "vk.com/images/" in url:
|
||||
return False
|
||||
if "vk.com/images/reaction/" in url:
|
||||
return False
|
||||
|
||||
# wikipedia
|
||||
if "wikipedia.org/static" in url:
|
||||
return False
|
||||
for parts in IRRELEVANT_URLS:
|
||||
if all(part in clean_url for part in parts):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@@ -22,35 +22,35 @@ from loguru import logger
|
||||
|
||||
class CookieSettingDriver(webdriver.Firefox):
|
||||
facebook_accept_cookies: bool
|
||||
cookies: str
|
||||
cookiejar: MozillaCookieJar
|
||||
cookie: str
|
||||
cookie_jar: MozillaCookieJar
|
||||
|
||||
def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
|
||||
def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
|
||||
if os.environ.get("RUNNING_IN_DOCKER"):
|
||||
# Selenium doesn't support linux-aarch64 driver, we need to set this manually
|
||||
kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
|
||||
|
||||
super(CookieSettingDriver, self).__init__(*args, **kwargs)
|
||||
self.cookies = cookies
|
||||
self.cookiejar = cookiejar
|
||||
self.cookie = cookie
|
||||
self.cookie_jar = cookie_jar
|
||||
self.facebook_accept_cookies = facebook_accept_cookies
|
||||
|
||||
def get(self, url: str):
|
||||
if self.cookies or self.cookiejar:
|
||||
if self.cookie_jar or self.cookie:
|
||||
# set up the driver to make it not 'cookie averse' (needs a context/URL)
|
||||
# get the 'robots.txt' file which should be quick and easy
|
||||
robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
|
||||
super(CookieSettingDriver, self).get(robots_url)
|
||||
|
||||
if self.cookies:
|
||||
if self.cookie:
|
||||
# an explicit cookie is set for this site, use that first
|
||||
for cookie in self.cookies.split(";"):
|
||||
for name, value in cookie.split("="):
|
||||
self.driver.add_cookie({"name": name, "value": value})
|
||||
elif self.cookiejar:
|
||||
domain = urlparse(url).netloc
|
||||
elif self.cookie_jar:
|
||||
domain = urlparse(url).netloc.removeprefix("www.")
|
||||
regex = re.compile(f"(www)?.?{domain}$")
|
||||
for cookie in self.cookiejar:
|
||||
for cookie in self.cookie_jar:
|
||||
if regex.match(cookie.domain):
|
||||
try:
|
||||
self.add_cookie(
|
||||
@@ -145,8 +145,8 @@ class Webdriver:
|
||||
|
||||
try:
|
||||
self.driver = CookieSettingDriver(
|
||||
cookies=self.auth.get("cookies"),
|
||||
cookiejar=self.auth.get("cookies_jar"),
|
||||
cookie=self.auth.get("cookie"),
|
||||
cookie_jar=self.auth.get("cookies_jar"),
|
||||
facebook_accept_cookies=self.facebook_accept_cookies,
|
||||
options=options,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user