Compare commits

..

27 Commits

Author SHA1 Message Date
Patrick Robertson
7d972ee9b8 Merge pull request #258 from bellingcat/version_bump
Version bump
2025-03-18 12:18:09 +00:00
Patrick Robertson
b64826dc16 Merge pull request #257 from bellingcat/standardise_parsedates
Standardise parse dates to get_datetime_from_str
2025-03-18 12:17:51 +00:00
Patrick Robertson
23e74803ee Version bump 2025-03-18 10:52:23 +00:00
Patrick Robertson
d03ecdb037 Standardise parse dates to get_datetime_from_str 2025-03-18 10:22:58 +00:00
Patrick Robertson
a5ebbf4726 Merge pull request #256 from bellingcat/dropin_cleanup
Refactor the dropin 'is_suitable' method + fix for tikwm
2025-03-18 10:08:24 +00:00
Patrick Robertson
89e387030d Tests for suitable URLs for tikwm 2025-03-18 10:04:03 +00:00
Patrick Robertson
8ec053ed1b Refactor the dropin 'is_suitable' method + fix tikwm implementation
Makes it easier to maintain/understand.
2025-03-18 09:14:14 +00:00
Patrick Robertson
3ea02c115e Merge pull request #254 from bellingcat/rtd_docs
Add info on building RTD versions + automated building of tagged versions
2025-03-17 13:01:20 +00:00
Patrick Robertson
ab03e48708 Add info on building RTD versions + automated building of tagged versions 2025-03-17 12:52:04 +00:00
Patrick Robertson
3d4056ef70 Merge pull request #223 from bellingcat/facebook_extractor
Create facebook dropin - working for images + text.
2025-03-17 12:45:05 +00:00
Patrick Robertson
51041bf91e Merge pull request #253 from bellingcat/settings_page
Update material version, minify code
2025-03-17 11:59:37 +00:00
Patrick Robertson
f56cd6891b Finish incomplete sentence 2025-03-17 10:33:50 +00:00
Patrick Robertson
0765640bff Fix up tiktok dropin for slightly modified generic_extractor format 2025-03-17 10:31:22 +00:00
Patrick Robertson
06b1f4c0ca Fix lingering merge conflict issues 2025-03-17 10:12:55 +00:00
Patrick Robertson
59b910ec30 Merge main 2025-03-17 10:05:11 +00:00
Patrick Robertson
7e360240bf Copy ytdlp code into AA project - seems like ytdlp won't be merged anytime soon 2025-03-17 09:57:05 +00:00
Patrick Robertson
9e03d745d8 Add '-it' to the list of docker flags, so that docker gives a colour log output 2025-03-17 09:45:12 +00:00
Patrick Robertson
7badf89c28 Create the 'secrets' folder if it doesn't exist on first run
Easier setup for users
2025-03-17 09:40:46 +00:00
Patrick Robertson
d59530c8e7 Fix if logic bug 2025-03-17 09:40:27 +00:00
Patrick Robertson
0ec5451f66 Nicer error log when no URLs provided for CLI feeder - don't need the stacktrace 2025-03-17 09:34:33 +00:00
Patrick Robertson
99e9ac2465 Fix 'Syntax Error' warning in python3.12+ 2025-03-17 09:29:51 +00:00
Patrick Robertson
42162c5e3f Various docs improvements based on Friday Office Hours discussion 2025-03-17 09:23:43 +00:00
Patrick Robertson
3afe519176 Fix link to module types in config editor 2025-03-17 09:17:17 +00:00
Patrick Robertson
f13349bacf Fix incorrect path in cp 2025-03-16 10:33:52 +00:00
Patrick Robertson
92c79ed994 Remove schema.json file from git - is auto-generated on release 2025-03-16 10:27:08 +00:00
Patrick Robertson
2643b8e717 Update material version, minify code 2025-03-16 10:22:54 +00:00
Patrick Robertson
f8e846d59a Create facebook dropin - working for images + text. CAVEAT: only gets the first ~100 chars of the post at the moment 2025-02-25 11:44:35 +00:00
32 changed files with 646 additions and 50644 deletions

1
.gitignore vendored
View File

@@ -34,4 +34,5 @@ docs/_build/
docs/source/autoapi/
docs/source/modules/autogen/
scripts/settings_page.html
scripts/settings/src/schema.json
.vite

View File

@@ -21,7 +21,7 @@ build:
# generate the config editor page. Schema then HTML
- VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry run python scripts/generate_settings_schema.py
# install node dependencies and build the settings
- cd scripts/settings && npm install && npm run build && yes | cp dist/index.html ../../docs/source/installation/settings_base.html && cd ../..
- cd scripts/settings && npm install && npm run build && yes | cp -v dist/index.html ../../docs/source/installation/settings.html && cd ../..
sphinx:

View File

@@ -29,7 +29,7 @@ View the [Installation Guide](https://auto-archiver.readthedocs.io/en/latest/ins
To get started quickly using Docker:
`docker pull bellingcat/auto-archiver && docker run --rm -v secrets:/app/secrets bellingcat/auto-archiver --config secrets/orchestration.yaml`
`docker pull bellingcat/auto-archiver && docker run -it --rm -v secrets:/app/secrets bellingcat/auto-archiver --config secrets/orchestration.yaml`
Or pip:

View File

@@ -36,3 +36,12 @@ open docs/_build/html/index.html
sphinx-autobuild docs/source docs/_build/html
```
### Managing Readthedocs (RTD) Versions
Version management is done at [https://app.readthedocs.org/projects/auto-archiver/](https://app.readthedocs.org/projects/auto-archiver/)
(login required). Once logged in, you can create new versions, delete old versions or change visibility of versions. More info on
[RTD](https://docs.readthedocs.com/platform/stable/versions.html).
Currently, the Auto Archiver project is set up to automatically create a new docs version for each `vX.Y.Z` release. For more on this,
see the RTD [instructions on automation](https://docs.readthedocs.com/platform/stable/guides/automation-rules.html) or edit the existing automation rule in the project settings.

View File

@@ -86,7 +86,7 @@ gsheet_feeder_db:
You can also pass these settings directly on the command line without having to edit the file. Here's an example of how to do that (using docker):
`docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder_db.sheet "My Awesome Sheet 2"`.
`docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver:dockerize --gsheet_feeder_db.sheet "My Awesome Sheet 2"`.
Here, the sheet name has been overridden/specified in the command line invocation.

View File

@@ -0,0 +1,60 @@
# Frequently Asked Questions
### Q: What websites does the Auto Archiver support?
**A:** The Auto Archiver works for a large variety of sites. Firstly, the Auto Archiver can download
and archive any video website supported by YT-DLP, a powerful video-downloading tool ([full list of
sites here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)). Aside from these sites,
there are various different 'Extractors' for specific websites. See the full list of extractors that
are available on the [extractors](../modules/extractor.md) page. Some sites supported include:
* Twitter
* Instagram
* Telegram
* VKontakte
* Tiktok
* Bluesky
```{note} What websites the Auto Archiver can archive depends on what extractors you have enabled in
your configuration. See [configuration](./configurations.md) for more info.
```
### Q: Does the Auto Archiver only work for social media posts?
**A:** No, the Auto Archiver can archive any web page on the internet, not just social media posts.
However, for social media posts Auto Archiver can extract more relevant/useful information (such as
post comments, likes, author etc.) which may not be available for a generic website. If you are looking
to more generally archive webpages, then you should make sure to enable the [](../modules/autogen/extractor/wacz_extractor_enricher.md)
and the [](../modules/autogen/extractor/wayback_extractor_enricher.md).
### Q: What kind of data is stored for each webpage that's archived?
**A:** This depends on the website archived, but more generally, for social media posts any videos and photos in
the post will be archived. For video sites, the video will be downloaded separately. For most of these sites, additional
metadata such as published date, uploader/author and ratings/comments will also be saved. Additionally, further data can be
saved depending on the enrichers that you have enabled. Some other types of data saved are timestamps if you have the
[](../modules/autogen/enricher/timestamping_enricher.md) or [](../modules/autogen/enricher/opentimestamps_enricher.md) enabled,
screenshots of the web page with the [](../modules/autogen/enricher/screenshot_enricher.md), and for videos, thumbnails of the
video with the [](../modules/autogen/enricher/thumbnail_enricher.md). You can also store things like hashes (SHA256, or pdq hashes)
with the various hash enrichers.
### Q: Where is my data stored?
**A:** With the default configuration, data is stored on your local computer in the `local_storage` folder. You can adjust these settings by
changing the [storage modules](../modules/storage.md) you have enabled. For example, you could choose to store your data in an S3 bucket or
on Google Drive.
```{note}
You can choose to store your data in multiple places, for example your local drive **and** an S3 bucket for redundancy.
```
### Q: What should I do if something doesn't work?
**A:** First, read through the log files to see if you can find a specific reason why something isn't working. Learn more about logging
and how to enable debug logging in the [Logging Howto](../how_to/logging.md).
If you cannot find an answer in the logs, then try searching this documentation or existing / closed issues on the [Github Issue Tracker](https://github.com/bellingcat/auto-archiver/issues?q=is%3Aissue%20). If you still cannot find an answer, then consider opening an issue on the Github Issue Tracker or asking in the Bellingcat Discord
'Auto Archiver' group.
#### Common reasons why archiving might not work:
* The website may have temporarily adjusted its settings - sometimes sites like Telegram or Twitter adjust their scraping protection settings. Often,
waiting a day or two and then trying again can work.
* The site requires you to be logged in - you could try using cookies or authentication to bypass any blocks. See [](../installation/authentication.md) for more information.
* The website you're trying to archive has changed its settings/structure. Make sure you're using the latest version of Auto Archiver and try again.

View File

@@ -1,5 +1,11 @@
# Installation
```{toctree}
:maxdepth: 1
upgrading.md
```
There are 3 main ways to use the auto-archiver. We recommend the 'docker' method for most uses. This installs all the requirements in one command.
1. Easiest (recommended): [via docker](#installing-with-docker)

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,6 @@
# Getting Started
```{toctree}
:maxdepth: 1
:hidden:
installation.md
@@ -9,6 +8,7 @@ configurations.md
config_editor.md
authentication.md
requirements.md
faq.md
config_cheatsheet.md
```
@@ -27,17 +27,18 @@ The way you run the Auto Archiver depends on how you installed it (docker instal
If you installed Auto Archiver using docker, open up your terminal, and copy-paste / type the following command:
```bash
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
```
breaking this command down:
1. `docker run` tells docker to start a new container (an instance of the image)
2. `--rm` makes sure this container is removed after execution (less garbage locally)
3. `-v $PWD/secrets:/app/secrets` - your secrets folder with settings
2. `-it` tells docker to run in 'interactive mode' so that we get nice colour logs
3. `--rm` makes sure this container is removed after execution (less garbage locally)
4. `-v $PWD/secrets:/app/secrets` - your secrets folder with settings
1. `-v` is a volume flag which means a folder that you have on your computer will be connected to a folder inside the docker container
2. `$PWD/secrets` points to a `secrets/` folder in your current working directory (where your console points to), we use this folder as a best practice to hold all the secrets/tokens/passwords/... you use
3. `/app/secrets` points to the path inside the docker container where this folder can be found
4. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
5. `-v $PWD/local_archive:/app/local_archive` - (optional) if you use local_storage
1. `-v` same as above, this is a volume instruction
2. `$PWD/local_archive` is a folder `local_archive/` in case you want to archive locally and have the files accessible outside docker
3. `/app/local_archive` is a folder inside docker that you can reference in your orchestration.yml file
@@ -48,14 +49,14 @@ The invocations below will run the auto-archiver Docker image using a configurat
```bash
# Have auto-archiver run with the default settings, generating a settings file in ./secrets/orchestration.yaml
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver
# uses the same configuration, but with the `gsheet_feeder`, a header on row 2 and with some different column names
# Note this expects you to have followed the [Google Sheets setup](how_to/google_sheets.md) and added your service_account.json to the `secrets/` folder
# notice that columns is a dictionary so you need to pass it as JSON and it will override only the values provided
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --feeders=gsheet_feeder --gsheet_feeder.sheet="use it on another sheets doc" --gsheet_feeder.header=2 --gsheet_feeder.columns='{"url": "link"}'
# Runs auto-archiver for the first time, but in 'full' mode, enabling all modules to get a full settings file
docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --mode full
docker run -it --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive bellingcat/auto-archiver --mode full
```
------------

View File

@@ -0,0 +1,30 @@
# Upgrading
If an update is available, then you will see a message in the logs when you
run Auto Archiver. Here's what those logs look like:
```{code} bash
********* IMPORTANT: UPDATE AVAILABLE ********
A new version of auto-archiver is available (v0.13.6, you have 0.13.4)
Make sure to update to the latest version using: `pip install --upgrade auto-archiver`
```
Upgrading Auto Archiver depends on the way you installed it.
## Docker
To upgrade using docker, update the docker image with:
```
docker pull bellingcat/auto-archiver:latest
```
## Pip
To upgrade the pip package, use:
```
pip install --upgrade auto-archiver
```

View File

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project]
name = "auto-archiver"
version = "0.13.6"
version = "0.13.7"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13"

View File

@@ -59,4 +59,5 @@ output_schema = {
current_file_dir = os.path.dirname(os.path.abspath(__file__))
output_file = os.path.join(current_file_dir, "settings/src/schema.json")
with open(output_file, "w") as file:
print(f"Writing schema to {output_file}")
json.dump(output_schema, file, indent=4, cls=SchemaEncoder)

View File

@@ -12,7 +12,7 @@
"@dnd-kit/sortable": "^10.0.0",
"@emotion/react": "latest",
"@emotion/styled": "latest",
"@mui/icons-material": "latest",
"@mui/icons-material": "^6.4.7",
"@mui/material": "latest",
"react": "19.0.0",
"react-dom": "19.0.0",
@@ -997,9 +997,9 @@
}
},
"node_modules/@mui/core-downloads-tracker": {
"version": "6.4.6",
"resolved": "https://registry.npmjs.org/@mui/core-downloads-tracker/-/core-downloads-tracker-6.4.6.tgz",
"integrity": "sha512-rho5Q4IscbrVmK9rCrLTJmjLjfH6m/NcqKr/mchvck0EIXlyYUB9+Z0oVmkt/+Mben43LMRYBH8q/Uzxj/c4Vw==",
"version": "6.4.7",
"resolved": "https://registry.npmjs.org/@mui/core-downloads-tracker/-/core-downloads-tracker-6.4.7.tgz",
"integrity": "sha512-XjJrKFNt9zAKvcnoIIBquXyFyhfrHYuttqMsoDS7lM7VwufYG4fAPw4kINjBFg++fqXM2BNAuWR9J7XVIuKIKg==",
"license": "MIT",
"funding": {
"type": "opencollective",
@@ -1007,9 +1007,9 @@
}
},
"node_modules/@mui/icons-material": {
"version": "6.4.6",
"resolved": "https://registry.npmjs.org/@mui/icons-material/-/icons-material-6.4.6.tgz",
"integrity": "sha512-rGJBvIQQbQAlyKYljHQ8wAQS/K2/uYwvemcpygnAmCizmCI4zSF9HQPuiG8Ql4YLZ6V/uKjA3WHIYmF/8sV+pQ==",
"version": "6.4.7",
"resolved": "https://registry.npmjs.org/@mui/icons-material/-/icons-material-6.4.7.tgz",
"integrity": "sha512-Rk8cs9ufQoLBw582Rdqq7fnSXXZTqhYRbpe1Y5SAz9lJKZP3CIdrj0PfG8HJLGw1hrsHFN/rkkm70IDzhJsG1g==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.26.0"
@@ -1022,7 +1022,7 @@
"url": "https://opencollective.com/mui-org"
},
"peerDependencies": {
"@mui/material": "^6.4.6",
"@mui/material": "^6.4.7",
"@types/react": "^17.0.0 || ^18.0.0 || ^19.0.0",
"react": "^17.0.0 || ^18.0.0 || ^19.0.0"
},
@@ -1033,14 +1033,14 @@
}
},
"node_modules/@mui/material": {
"version": "6.4.6",
"resolved": "https://registry.npmjs.org/@mui/material/-/material-6.4.6.tgz",
"integrity": "sha512-6UyAju+DBOdMogfYmLiT3Nu7RgliorimNBny1pN/acOjc+THNFVE7hlxLyn3RDONoZJNDi/8vO4AQQr6dLAXqA==",
"version": "6.4.7",
"resolved": "https://registry.npmjs.org/@mui/material/-/material-6.4.7.tgz",
"integrity": "sha512-K65StXUeGAtFJ4ikvHKtmDCO5Ab7g0FZUu2J5VpoKD+O6Y3CjLYzRi+TMlI3kaL4CL158+FccMoOd/eaddmeRQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.26.0",
"@mui/core-downloads-tracker": "^6.4.6",
"@mui/system": "^6.4.6",
"@mui/core-downloads-tracker": "^6.4.7",
"@mui/system": "^6.4.7",
"@mui/types": "^7.2.21",
"@mui/utils": "^6.4.6",
"@popperjs/core": "^2.11.8",
@@ -1061,7 +1061,7 @@
"peerDependencies": {
"@emotion/react": "^11.5.0",
"@emotion/styled": "^11.3.0",
"@mui/material-pigment-css": "^6.4.6",
"@mui/material-pigment-css": "^6.4.7",
"@types/react": "^17.0.0 || ^18.0.0 || ^19.0.0",
"react": "^17.0.0 || ^18.0.0 || ^19.0.0",
"react-dom": "^17.0.0 || ^18.0.0 || ^19.0.0"
@@ -1143,9 +1143,9 @@
}
},
"node_modules/@mui/system": {
"version": "6.4.6",
"resolved": "https://registry.npmjs.org/@mui/system/-/system-6.4.6.tgz",
"integrity": "sha512-FQjWwPec7pMTtB/jw5f9eyLynKFZ6/Ej9vhm5kGdtmts1z5b7Vyn3Rz6kasfYm1j2TfrfGnSXRvvtwVWxjpz6g==",
"version": "6.4.7",
"resolved": "https://registry.npmjs.org/@mui/system/-/system-6.4.7.tgz",
"integrity": "sha512-7wwc4++Ak6tGIooEVA9AY7FhH2p9fvBMORT4vNLMAysH3Yus/9B9RYMbrn3ANgsOyvT3Z7nE+SP8/+3FimQmcg==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.26.0",

View File

@@ -13,7 +13,7 @@
"@dnd-kit/sortable": "^10.0.0",
"@emotion/react": "latest",
"@emotion/styled": "latest",
"@mui/icons-material": "latest",
"@mui/icons-material": "^6.4.7",
"@mui/material": "latest",
"react": "19.0.0",
"react-dom": "19.0.0",

View File

@@ -4,7 +4,7 @@ import Container from '@mui/material/Container';
import Typography from '@mui/material/Typography';
import Box from '@mui/material/Box';
import FileUploadIcon from '@mui/icons-material/FileUpload';
//
import {
DndContext,
closestCenter,
@@ -204,7 +204,7 @@ function ModuleTypes({ stepType, setEnabledModules, enabledModules, configValues
{stepType}
</Typography>
<Typography variant="body1" >
Select the <a href="<a href={`https://auto-archiver.readthedocs.io/en/latest/modules/${stepType.slice(0,-1)}.html`}" target="_blank">{stepType}</a> you wish to enable. Drag to reorder.
Select the <a href={`https://auto-archiver.readthedocs.io/en/latest/modules/${stepType.slice(0,-1)}.html`} target="_blank">{stepType}</a> you wish to enable. Drag to reorder.
</Typography>
</Box>
{showError ? <Typography variant="body1" color="error" >Only one {stepType.slice(0,-1)} can be enabled at a time.</Typography> : null}

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@ import { viteSingleFile } from "vite-plugin-singlefile"
export default defineConfig({
plugins: [react(), viteSingleFile()],
build: {
minify: false,
sourcemap: true,
// minify: false,
// sourcemap: true,
}
});

View File

@@ -8,6 +8,7 @@ flexible setup in various environments.
import argparse
from ruamel.yaml import YAML, CommentedMap
import json
import os
from loguru import logger
@@ -230,6 +231,10 @@ def read_yaml(yaml_filename: str) -> CommentedMap:
def store_yaml(config: CommentedMap, yaml_filename: str) -> None:
config_to_save = deepcopy(config)
## if the save path is the default location (secrets) then create the 'secrets' folder
if os.path.dirname(yaml_filename) == "secrets":
os.makedirs("secrets", exist_ok=True)
auth_dict = config_to_save.get("authentication", {})
if auth_dict and auth_dict.get("load_from_file"):
# remove all other values from the config, don't want to store it in the config file

View File

@@ -112,7 +112,7 @@ class ArchivingOrchestrator:
def check_steps(self, config):
for module_type in MODULE_TYPES:
if not config["steps"].get(f"{module_type}s", []):
if module_type == "feeder" or module_type == "formatter" and config["steps"].get(f"{module_type}"):
if (module_type == "feeder" or module_type == "formatter") and config["steps"].get(f"{module_type}"):
raise SetupError(
f"It appears you have '{module_type}' set under 'steps' in your configuration file, but as of version 0.13.0 of Auto Archiver, you must use '{module_type}s'. Change this in your configuration file and try again. \
Here's how that would look: \n\nsteps:\n {module_type}s:\n - [your_{module_type}_name_here]\n {'extractors:...' if module_type == 'feeder' else '...'}\n"
@@ -377,7 +377,8 @@ Here's how that would look: \n\nsteps:\n extractors:\n - [your_extractor_name_
try:
loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if not isinstance(e, KeyboardInterrupt) and not isinstance(e, SetupError):
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if loaded_module and module_type == "extractor":
loaded_module.cleanup()
raise e

View File

@@ -2,13 +2,14 @@ from loguru import logger
from auto_archiver.core.feeder import Feeder
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.consts import SetupError
class CLIFeeder(Feeder):
def setup(self) -> None:
self.urls = self.config["urls"]
if not self.urls:
raise ValueError(
raise SetupError(
"No URLs provided. Please provide at least one URL via the command line, or set up an alternative feeder. Use --help for more information."
)

View File

@@ -15,6 +15,9 @@ supported by `yt-dlp`, such as YouTube, Facebook, and others. It provides functi
for retrieving videos, subtitles, comments, and other metadata, and it integrates with
the broader archiving framework.
For a full list of video platforms supported by `yt-dlp`, see the
[official documentation](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)
### Features
- Supports downloading videos and playlists.
- Retrieves metadata like titles, descriptions, upload dates, and durations.

View File

@@ -1,3 +1,4 @@
from typing import Type
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.core.extractor import Extractor
@@ -24,6 +25,8 @@ class GenericDropin:
"""
extractor: Type[Extractor] = None
def extract_post(self, url: str, ie_instance: InfoExtractor):
"""
This method should return the post data from the url.
@@ -55,3 +58,19 @@ class GenericDropin:
This method should download any additional media from the post.
"""
return metadata
def suitable(self, url, info_extractor: InfoExtractor):
    """
    Decide whether this dropin wants to handle `url`.

    Dropins may override this to widen (or narrow) what their InfoExtractor accepts, returning
    True when the url is suitable for the extractor. See the `suitable_extractors` method in the
    `GenericExtractor` class for how the result is consumed.

    Unless overridden, the answer is simply whatever the InfoExtractor's own 'suitable' method
    says for the url.

    ### Example: the FacebookIE extractor in yt-dlp only reports video URLs as suitable by
    default. The Facebook dropin overrides this method to return True for all Facebook URLs
    (photo/post types) so that it can be used for every Facebook URL.
    """
    extractor_verdict = info_extractor.suitable(url)
    return extractor_verdict

View File

@@ -1,17 +1,154 @@
import re
from .dropin import GenericDropin
from auto_archiver.core.metadata import Metadata
from yt_dlp.extractor.facebook import FacebookIE
# TODO: Remove if / when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
from yt_dlp.utils import (
clean_html,
get_element_by_id,
traverse_obj,
get_first,
merge_dicts,
int_or_none,
parse_count,
)
def _extract_metadata(self, webpage, video_id):
    """
    Build an info dict (title, description, uploader, counts, ...) from a Facebook page's HTML.

    This is a module-level function that takes the yt-dlp extractor instance explicitly as
    `self`, so it can be used until the equivalent method is available upstream
    (see the yt-dlp pull request referenced in the TODO near the imports of this file).

    :param self: the InfoExtractor instance whose helper methods (`_parse_json`,
        `_html_search_regex`, `_html_search_meta`, `_search_regex`, `_search_json_ld`, ...) are used.
    :param webpage: the downloaded HTML of the post, as a string.
    :param video_id: the post/video id; compared against media ids and urls in the embedded
        JSON and used in the fallback title.
    :return: the result of merging the JSON-LD info found in the page with the fields
        collected here.
    """
    # Parse (non-fatally) every JSON payload found in a `data-sjs` script tag that mentions ScheduledServerJS.
    post_data = [
        self._parse_json(j, video_id, fatal=False)
        for j in re.findall(r"data-sjs>({.*?ScheduledServerJS.*?})</script>", webpage)
    ]
    # Collect the nested `result.data` dicts from those payloads (empty list if none are found).
    post = (
        traverse_obj(
            post_data,
            (..., "require", ..., ..., ..., "__bbox", "require", ..., ..., ..., "__bbox", "result", "data"),
            expected_type=dict,
        )
        or []
    )
    # Attachment media entries of type "Video" whose id matches the requested id.
    media = traverse_obj(
        post,
        (
            ...,
            "attachments",
            ...,
            lambda k, v: (k == "media" and str(v["id"]) == video_id and v["__typename"] == "Video"),
        ),
        expected_type=dict,
    )
    title = get_first(media, ("title", "text"))
    description = get_first(media, ("creation_story", "comet_sections", "message", "story", "message", "text"))
    # When the embedded JSON has no title, fall back to patterns searched in the raw HTML.
    page_title = title or self._html_search_regex(
        (
            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>(?P<content>[^<]*)</h2>',
            r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(?P<content>.*?)</span>',
            self._meta_regex("og:title"),
            self._meta_regex("twitter:title"),
            r"<title>(?P<content>.+?)</title>",
        ),
        webpage,
        "title",
        default=None,
        group="content",
    )
    # Same idea for the description: fall back to the page's meta tags.
    description = description or self._html_search_meta(
        ["description", "og:description", "twitter:description"], webpage, "description", default=None
    )
    # Try several locations for the owner/author record; the first one that yields a value wins.
    uploader_data = (
        get_first(media, ("owner", {dict}))
        or get_first(
            post, ("video", "creation_story", "attachments", ..., "media", lambda k, v: k == "owner" and v["name"])
        )
        or get_first(post, (..., "video", lambda k, v: k == "owner" and v["name"]))
        or get_first(post, ("node", "actors", ..., {dict}))
        or get_first(post, ("event", "event_creator", {dict}))
        or get_first(post, ("video", "creation_story", "short_form_video_context", "video_owner", {dict}))
        or {}
    )
    # Prefer the name from the owner record; otherwise look for an author name in the HTML.
    uploader = uploader_data.get("name") or (
        clean_html(get_element_by_id("fbPhotoPageAuthorName", webpage))
        or self._search_regex(
            (r'ownerName\s*:\s*"([^"]+)"', *self._og_regexes("title")), webpage, "uploader", fatal=False
        )
    )
    # Timestamp is read from the `data-utime` attribute of an <abbr> tag, when present.
    timestamp = int_or_none(self._search_regex(r'<abbr[^>]+data-utime=["\'](\d+)', webpage, "timestamp", default=None))
    thumbnail = self._html_search_meta(["og:image", "twitter:image"], webpage, "thumbnail", default=None)
    # some webpages contain unretrievable thumbnail urls
    # like https://lookaside.fbsbx.com/lookaside/crawler/media/?media_id=10155168902769113&get_thumbnail=1
    # in https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/
    if thumbnail and not re.search(r"\.(?:jpg|png)", thumbnail):
        thumbnail = None
    info_dict = {
        "description": description,
        "uploader": uploader,
        "uploader_id": uploader_data.get("id"),
        "timestamp": timestamp,
        "thumbnail": thumbnail,
        "view_count": parse_count(
            self._search_regex(
                (r'\bviewCount\s*:\s*["\']([\d,.]+)', r'video_view_count["\']\s*:\s*(\d+)'),
                webpage,
                "view count",
                default=None,
            )
        ),
        "concurrent_view_count": get_first(
            post, (("video", (..., ..., "attachments", ..., "media")), "liveViewerCount", {int_or_none})
        ),
        # Like/comment/share counts come from the `feedback` of the entry whose url contains the id.
        **traverse_obj(
            post,
            (
                lambda _, v: video_id in v["url"],
                "feedback",
                {
                    "like_count": ("likers", "count", {int}),
                    "comment_count": ("total_comment_count", {int}),
                    "repost_count": ("share_count_reduced", {parse_count}),
                },
            ),
            get_all=False,
        ),
    }

    info_json_ld = self._search_json_ld(webpage, video_id, default={})
    # Title preference: first non-empty of JSON title / JSON-LD title / page title with a trailing
    # " | Facebook" removed; else the description on a single line; else a generic placeholder.
    info_json_ld["title"] = (
        re.sub(r"\s*\|\s*Facebook$", "", title or info_json_ld.get("title") or page_title or "")
        or (description or "").replace("\n", " ")
        or f"Facebook video #{video_id}"
    )
    return merge_dicts(info_json_ld, info_dict)
class Facebook(GenericDropin):
def extract_post(self, url: str, ie_instance):
video_id = ie_instance._match_valid_url(url).group("id")
ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), video_id)
webpage = ie_instance._download_webpage(url, ie_instance._match_valid_url(url).group("id"))
def extract_post(self, url: str, ie_instance: FacebookIE):
post_id_regex = r"(?P<id>pfbid[A-Za-z0-9]+|\d+|t\.(\d+\/\d+))"
post_id = re.search(post_id_regex, url).group("id")
webpage = ie_instance._download_webpage(url.replace("://m.facebook.com/", "://www.facebook.com/"), post_id)
# TODO: fix once https://github.com/yt-dlp/yt-dlp/pull/12275 is merged
post_data = ie_instance._extract_metadata(webpage)
# TODO: For long posts, this _extract_metadata only seems to return the first 100 or so characters, followed by ...
# TODO: If/when https://github.com/yt-dlp/yt-dlp/pull/12275 is merged, uncomment next line and delete the one after
# post_data = ie_instance._extract_metadata(webpage, post_id)
post_data = _extract_metadata(ie_instance, webpage, post_id)
return post_data
def create_metadata(self, post: dict, ie_instance, archiver, url):
metadata = archiver.create_metadata(url)
metadata.set_title(post.get("title")).set_content(post.get("description")).set_post_data(post)
return metadata
def create_metadata(self, post: dict, ie_instance: FacebookIE, archiver, url):
    """
    Create a new Metadata object from an extracted Facebook post.

    :param post: the post dict returned by `extract_post`; the 'description', 'title' and
        'uploader' keys are read, each defaulting to an empty string when missing.
    :param ie_instance: the FacebookIE instance (not used here).
    :param archiver: the calling extractor (not used here).
    :param url: the URL of the post, stored on the metadata.
    :return: the populated Metadata object.
    """
    description = post.get("description", "")
    title = post.get("title", "")
    author = post.get("uploader", "")

    metadata = Metadata()
    metadata.set_content(description)
    metadata.set_title(title)
    metadata.set("author", author)
    metadata.set_url(url)
    return metadata
def suitable(self, url, info_extractor: FacebookIE):
    """
    Return True for any Facebook URL (not only the video URLs FacebookIE accepts).

    A URL qualifies when it is http(s), optionally has a single subdomain label, and its host
    is `facebook.com` or Facebook's onion address.

    :param url: the URL to check.
    :param info_extractor: the FacebookIE extractor (not consulted; the check is purely URL based).
    :return: True if the URL points at Facebook, False otherwise.
    """
    # NOTE: a single `|` between the two hosts. A doubled `||` would add an empty alternative
    # and let hosts such as `https://example./` through.
    regex = r"(?:https?://(?:[\w-]+\.)?(?:facebook\.com|facebookwkhpilnemxj7asaniu7vnjjbiltxjqhye3mhbshg7kx5tfyd\.onion)/)"
    return re.match(regex, url) is not None
def skip_ytdlp_download(self, url: str, is_instance: FacebookIE):
    """
    Skip using the ytdlp download method for Facebook *photo* posts, they have a URL with an id of t.XXXXX/XXXXX

    :param url: the URL being archived.
    :param is_instance: the FacebookIE extractor (not used).
    :return: True if the URL contains a `/t.<digits>/<digits>` photo id, False otherwise.

    NOTE(review): the visible call site invokes this as
    `skip_ytdlp_download(info_extractor, url)`, i.e. with the arguments in the opposite order
    to this signature - confirm which order is intended and align caller and dropins.
    """
    # The dot is escaped so only a literal "t." prefix counts (not e.g. "/tx123/456").
    return re.search(r"/t\.\d+/\d+", url) is not None

View File

@@ -13,6 +13,8 @@ from loguru import logger
from auto_archiver.core.extractor import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import get_datetime_from_str
from .dropin import GenericDropin
class SkipYtdlp(Exception):
@@ -67,7 +69,14 @@ class GenericExtractor(Extractor):
"""
Returns a list of valid extractors for the given URL"""
for info_extractor in yt_dlp.YoutubeDL()._ies.values():
if info_extractor.suitable(url) and info_extractor.working():
if not info_extractor.working():
continue
# check if there's a dropin and see if that declares whether it's suitable
dropin: GenericDropin = self.dropin_for_name(info_extractor.ie_key())
if dropin and dropin.suitable(url, info_extractor):
yield info_extractor
elif info_extractor.suitable(url):
yield info_extractor
def suitable(self, url: str) -> bool:
@@ -188,9 +197,13 @@ class GenericExtractor(Extractor):
result = self.download_additional_media(video_data, info_extractor, result)
# keep both 'title' and 'fulltitle', but prefer 'title', falling back to 'fulltitle' if it doesn't exist
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
result.set_url(url)
if "description" in video_data:
if not result.get_title():
result.set_title(video_data.pop("title", video_data.pop("fulltitle", "")))
if not result.get("url"):
result.set_url(url)
if "description" in video_data and not result.get_content():
result.set_content(video_data["description"])
# extract comments if enabled
if self.comments:
@@ -207,11 +220,11 @@ class GenericExtractor(Extractor):
)
# then add the common metadata
if timestamp := video_data.pop("timestamp", None):
# The walrus is parenthesised on purpose: `:=` binds looser than `and`, so without the
# parentheses `timestamp` would hold the boolean result of the whole condition instead
# of the popped value, and fromtimestamp(True) would give 1970-01-01T00:00:01.
# A timestamp already present on `result` is kept.
if (timestamp := video_data.pop("timestamp", None)) and not result.get("timestamp"):
    timestamp = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc).isoformat()
    result.set_timestamp(timestamp)
if upload_date := video_data.pop("upload_date", None):
upload_date = datetime.datetime.strptime(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
# The walrus is parenthesised on purpose: `:=` binds looser than `and`, so without the
# parentheses `upload_date` would be True/False rather than the popped "%Y%m%d" string
# that is handed to the parser. An upload_date already present on `result` is kept.
if (upload_date := video_data.pop("upload_date", None)) and not result.get("upload_date"):
    upload_date = get_datetime_from_str(upload_date, "%Y%m%d").replace(tzinfo=datetime.timezone.utc)
    result.set("upload_date", upload_date)
# then clean away any keys we don't want
@@ -240,7 +253,8 @@ class GenericExtractor(Extractor):
return False
post_data = dropin.extract_post(url, ie_instance)
return dropin.create_metadata(post_data, ie_instance, self, url)
result = dropin.create_metadata(post_data, ie_instance, self, url)
return self.add_metadata(post_data, info_extractor, url, result)
def get_metadata_for_video(
self, data: dict, info_extractor: Type[InfoExtractor], url: str, ydl: yt_dlp.YoutubeDL
@@ -285,7 +299,7 @@ class GenericExtractor(Extractor):
return self.add_metadata(data, info_extractor, url, result)
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> Type[InfoExtractor]:
def dropin_for_name(self, dropin_name: str, additional_paths=[], package=__package__) -> GenericDropin:
dropin_name = dropin_name.lower()
if dropin_name == "generic":
@@ -296,6 +310,7 @@ class GenericExtractor(Extractor):
def _load_dropin(dropin):
    """Instantiate the dropin class found on `dropin` and register it in `self._dropins`.

    The class is looked up on `dropin` by `dropin_class_name`. The returned value is whatever
    `self._dropins` holds for `dropin_name` after `setdefault`, i.e. an already-registered
    entry wins over the freshly created instance.
    """
    instance = getattr(dropin, dropin_class_name)()
    # NOTE(review): `extractor` is set on `dropin` (the object the class was looked up on),
    # not on the new instance - confirm this is the intended target.
    dropin.extractor = self
    registry = self._dropins
    return registry.setdefault(dropin_name, instance)
try:
@@ -340,7 +355,7 @@ class GenericExtractor(Extractor):
dropin_submodule = self.dropin_for_name(info_extractor.ie_key())
try:
if dropin_submodule and dropin_submodule.skip_ytdlp_download(info_extractor, url):
if dropin_submodule and dropin_submodule.skip_ytdlp_download(url, info_extractor):
logger.debug(f"Skipping using ytdlp to download files for {info_extractor.ie_key()}")
raise SkipYtdlp()
@@ -359,7 +374,7 @@ class GenericExtractor(Extractor):
if not isinstance(e, SkipYtdlp):
logger.debug(
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use extractor to get post data instead'
f'Issue using "{info_extractor.IE_NAME}" extractor to download video (error: {repr(e)}), attempting to use dropin to get post data instead'
)
try:

View File

@@ -1,5 +1,8 @@
import requests
from loguru import logger
from yt_dlp.extractor.tiktok import TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE
from auto_archiver.core import Metadata, Media
from datetime import datetime, timezone
from .dropin import GenericDropin
@@ -13,6 +16,11 @@ class Tiktok(GenericDropin):
TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
def suitable(self, url, info_extractor) -> bool:
    """Report whether any of the TikTok yt-dlp extractors accepts `url`.

    This dropin (which uses Tikwm) is meant for *all* TikTok URL types - videos, lives,
    VMs and users - so each of those extractor classes is asked in turn. The
    `info_extractor` argument is not consulted.
    """
    tiktok_extractors = (TikTokIE, TikTokLiveIE, TikTokVMIE, TikTokUserIE)
    for extractor_cls in tiktok_extractors:
        if extractor_cls().suitable(url):
            return True
    return False
def extract_post(self, url: str, ie_instance):
logger.debug(f"Using Tikwm API to attempt to download tiktok video from {url=}")
@@ -38,6 +46,9 @@ class Tiktok(GenericDropin):
api_data["video_url"] = video_url
return api_data
def keys_to_clean(self, video_data: dict, info_extractor):
    """Return the list of post-data keys to be cleaned away for this dropin.

    The list is fixed; neither `video_data` nor `info_extractor` influences it.
    """
    removable_keys = [
        "video_url",
        "title",
        "create_time",
        "author",
        "cover",
        "origin_cover",
        "ai_dynamic_cover",
        "duration",
    ]
    return removable_keys
def create_metadata(self, post: dict, ie_instance, archiver, url):
# prepare result, start by downloading video
result = Metadata()
@@ -54,17 +65,17 @@ class Tiktok(GenericDropin):
logger.error(f"failed to download video from {video_url}")
return False
video_media = Media(video_downloaded)
if duration := post.pop("duration", None):
if duration := post.get("duration", None):
video_media.set("duration", duration)
result.add_media(video_media)
# add remaining metadata
result.set_title(post.pop("title", ""))
result.set_title(post.get("title", ""))
if created_at := post.pop("create_time", None):
if created_at := post.get("create_time", None):
result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
if author := post.pop("author", None):
if author := post.get("author", None):
result.set("author", author)
result.set("api_data", post)

View File

@@ -1,13 +1,12 @@
import re
import mimetypes
import json
from datetime import datetime
from loguru import logger
from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import url as UrlUtil
from auto_archiver.utils import url as UrlUtil, get_datetime_from_str
from auto_archiver.core.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
@@ -38,7 +37,7 @@ class Twitter(GenericDropin):
try:
if not tweet.get("user") or not tweet.get("created_at"):
raise ValueError("Error retreiving post. Are you sure it exists?")
timestamp = datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
timestamp = get_datetime_from_str(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
except (ValueError, KeyError) as ex:
logger.warning(f"Unable to parse tweet: {str(ex)}\nRetreived tweet data: {tweet}")
return False

View File

@@ -20,7 +20,7 @@
"save_absolute": {
"default": False,
"type": "bool",
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)",
"help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (Warning: saving an absolute path will show your computer's file structure)",
},
},
"description": """

View File

@@ -2,7 +2,6 @@ import json
import re
import mimetypes
import requests
from datetime import datetime
from loguru import logger
from pytwitter import Api
@@ -10,6 +9,7 @@ from slugify import slugify
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import get_datetime_from_str
class TwitterApiExtractor(Extractor):
@@ -91,7 +91,7 @@ class TwitterApiExtractor(Extractor):
result = Metadata()
result.set_title(tweet.data.text)
result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
result.set_timestamp(get_datetime_from_str(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
urls = []
if tweet.includes:

View File

@@ -49,7 +49,7 @@ class CookieSettingDriver(webdriver.Firefox):
self.driver.add_cookie({"name": name, "value": value})
elif self.cookiejar:
domain = urlparse(url).netloc
regex = re.compile(f"(www)?\.?{domain}$")
# Raw f-string so `\.` is a literal dot (optionally preceded by "www"), and re.escape so
# dots in the URL's netloc are matched literally instead of as "any character".
regex = re.compile(rf"(www)?\.?{re.escape(domain)}$")
for cookie in self.cookiejar:
if regex.match(cookie.domain):
try:

View File

@@ -118,7 +118,7 @@ def pytest_runtest_setup(item):
pytest.xfail(f"previous test failed ({test_name})")
@pytest.fixture()
@pytest.fixture
def unpickle():
"""
Returns a helper function that unpickles a file

View File

@@ -40,6 +40,22 @@ class TestGenericExtractor(TestExtractorBase):
path = os.path.join(dirname(dirname(__file__)), "data/")
assert self.extractor.dropin_for_name("dropin", additional_paths=[path])
@pytest.mark.parametrize(
    "url, suitable_extractors",
    [
        ("https://www.youtube.com/watch?v=5qap5aO4i9A", ["youtube"]),
        ("https://www.tiktok.com/@funnycats0ftiktok/video/7345101300750748970?lang=en", ["tiktok"]),
        ("https://www.instagram.com/p/CU1J9JYJ9Zz/", ["instagram"]),
        ("https://www.facebook.com/nytimes/videos/10160796550110716", ["facebook"]),
        ("https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/", ["facebook"]),
    ],
)
def test_suitable_extractors(self, url, suitable_extractors):
    """Each URL should yield exactly its named extractor followed by the generic one."""
    # the generic extractor is valid for every URL, so it is always expected last
    expected_keys = suitable_extractors + ["generic"]
    found = list(self.extractor.suitable_extractors(url))
    assert len(found) == len(expected_keys)
    found_keys = [ie.ie_key().lower() for ie in found]
    assert found_keys == expected_keys
@pytest.mark.parametrize(
"url, is_suitable",
[
@@ -55,7 +71,7 @@ class TestGenericExtractor(TestExtractorBase):
("https://google.com", True),
],
)
def test_suitable_urls(self, make_item, url, is_suitable):
def test_suitable_urls(self, url, is_suitable):
"""
Note: expected behaviour is to return True for all URLs, as YoutubeDLArchiver should be able to handle all URLs
This behaviour may be changed in the future (e.g. if we want the youtubedl archiver to just handle URLs it has extractors for,
@@ -245,3 +261,32 @@ class TestGenericExtractor(TestExtractorBase):
self.assertValidResponseMetadata(post, title, timestamp)
assert len(post.media) == 1
assert post.media[0].hash == image_hash
@pytest.mark.download
def test_download_facebook_video(self, make_item):
    """Download a Facebook video post and check its two media entries and its title."""
    item = make_item("https://www.facebook.com/bellingcat/videos/588371253839133")
    post = self.extractor.download(item)

    assert len(post.media) == 2
    video = post.media[0]
    assert video.filename.endswith("588371253839133.mp4")
    assert video.mimetype == "video/mp4"
    image = post.media[1]
    assert image.filename.endswith(".jpg")
    assert image.mimetype == "image/jpeg"

    assert "Bellingchat Premium is with Kolina Koltai" in post.get_title()
@pytest.mark.download
def test_download_facebook_image(self, make_item):
    """Download a Facebook photo post and check for a single .png media entry and the title."""
    photo_url = "https://www.facebook.com/BylineFest/photos/t.100057299682816/927879487315946/"
    post = self.extractor.download(make_item(photo_url))

    assert len(post.media) == 1
    only_media = post.media[0]
    assert only_media.filename.endswith(".png")
    assert "Byline Festival - BylineFest Partner" == post.get_title()
@pytest.mark.download
def test_download_facebook_text_only(self, make_item):
    """Download a text-only Facebook post and check its content and title."""
    url = "https://www.facebook.com/bellingcat/posts/pfbid02rzpwZxAZ8bLkAX8NvHv4DWAidFaqAUfJMbo9vWkpwxL7uMUWzWMiizXLWRSjwihVl"
    item = make_item(url)
    post = self.extractor.download(item)

    expected_snippet = "Bellingcat researcher Kolina Koltai delves deeper into Clothoff"
    assert expected_snippet in post.get("content")
    assert post.get_title() == "Bellingcat"

View File

@@ -4,6 +4,8 @@ import pytest
import yt_dlp
from auto_archiver.modules.generic_extractor.generic_extractor import GenericExtractor
from auto_archiver.modules.generic_extractor.tiktok import Tiktok, TikTokIE
from .test_extractor_base import TestExtractorBase
@@ -17,11 +19,16 @@ def skip_ytdlp_own_methods(mocker):
)
@pytest.fixture()
@pytest.fixture
def mock_get(mocker):
    """Patch `requests.get` as referenced from the tiktok dropin module and return the patch."""
    patch_target = "auto_archiver.modules.generic_extractor.tiktok.requests.get"
    patched = mocker.patch(patch_target)
    return patched
@pytest.fixture
def tiktok_dropin() -> Tiktok:
    """Provide a freshly constructed `Tiktok` dropin instance."""
    dropin = Tiktok()
    return dropin
class TestTiktokTikwmExtractor(TestExtractorBase):
"""
Test suite for TestTiktokTikwmExtractor.
@@ -34,6 +41,25 @@ class TestTiktokTikwmExtractor(TestExtractorBase):
VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
@pytest.mark.parametrize(
    "url, is_suitable",
    [
        ("https://bellingcat.com", False),
        ("https://youtube.com", False),
        ("https://tiktok.co/", False),
        ("https://tiktok.com/", False),
        ("https://www.tiktok.com/", False),
        ("https://api.cool.tiktok.com/", False),
        (VALID_EXAMPLE_URL, True),
        ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
        ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
        ("https://www.tiktok.com/t/ZP8YQ8e5j/", True),
        ("https://vt.tiktok.com/ZSMTJeqRP/", True),
    ],
)
def test_is_suitable(self, url, is_suitable, tiktok_dropin):
    """The dropin's `suitable` verdict must equal the expected flag for each URL."""
    verdict = tiktok_dropin.suitable(url, TikTokIE())
    assert verdict == is_suitable
def test_invalid_json_responses(self, mock_get, make_item, caplog):
mock_get.return_value.status_code = 200
mock_get.return_value.json.side_effect = ValueError