mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-11 12:48:28 +03:00
Compare commits
11 Commits
v0.11.5
...
youtube-co
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
82c00d491d | ||
|
|
e49550163f | ||
|
|
e6f5981afc | ||
|
|
c62bf1a34d | ||
|
|
b166d57e61 | ||
|
|
11c3288267 | ||
|
|
004143a58a | ||
|
|
686f0027c4 | ||
|
|
b03cf32c73 | ||
|
|
dc9e64397e | ||
|
|
c7bc5e2988 |
4
Pipfile
4
Pipfile
@@ -30,10 +30,10 @@ tqdm = "*"
|
||||
jinja2 = "*"
|
||||
cryptography = "*"
|
||||
dataclasses-json = "*"
|
||||
yt-dlp = "*"
|
||||
yt-dlp = "2024.09.27"
|
||||
vk-url-scraper = "*"
|
||||
requests = {extras = ["socks"], version = "*"}
|
||||
numpy = "*"
|
||||
numpy = "1.26.4"
|
||||
warcio = "*"
|
||||
jsonlines = "*"
|
||||
pysubs2 = "*"
|
||||
|
||||
1974
Pipfile.lock
generated
1974
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
|
||||
|
||||
<details><summary><code>Python package instructions</code></summary>
|
||||
|
||||
1. make sure you have python 3.8 or higher installed
|
||||
1. make sure you have python 3.10 or higher installed
|
||||
2. install the package `pip/pipenv/conda install auto-archiver`
|
||||
3. test it's installed with `auto-archiver --help`
|
||||
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
|
||||
@@ -108,7 +108,7 @@ configurations:
|
||||
# ... configurations for the other steps here ...
|
||||
```
|
||||
|
||||
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
|
||||
|
||||
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:
|
||||
|
||||
|
||||
@@ -16,8 +16,13 @@ steps:
|
||||
# - wacz_archiver_enricher
|
||||
enrichers:
|
||||
- hash_enricher
|
||||
# - meta_enricher
|
||||
# - metadata_enricher
|
||||
# - screenshot_enricher
|
||||
# - pdq_hash_enricher
|
||||
# - ssl_enricher
|
||||
# - timestamping_enricher
|
||||
# - whisper_enricher
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_archiver_enricher
|
||||
@@ -89,6 +94,14 @@ configurations:
|
||||
password: "vk pass"
|
||||
session_file: "secrets/vk_config.v2.json"
|
||||
|
||||
youtubedl_archiver:
|
||||
subtitles: true
|
||||
# use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
|
||||
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
|
||||
# cookie_file: "secrets/youtube_cookies.txt"
|
||||
# cookies_from_browser: firefox
|
||||
# proxy: socks5://proxy-user:password@proxy-ip:port
|
||||
|
||||
screenshot_enricher:
|
||||
width: 1280
|
||||
height: 2300
|
||||
|
||||
@@ -27,7 +27,7 @@ package_dir=
|
||||
=src
|
||||
packages=find:
|
||||
find_packages=true
|
||||
python_requires = >=3.8
|
||||
python_requires = >=3.10
|
||||
|
||||
[options.package_data]
|
||||
* = *.html
|
||||
|
||||
@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
|
||||
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
|
||||
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
|
||||
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
|
||||
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
|
||||
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
|
||||
}
|
||||
|
||||
def download(self, item: Metadata) -> Metadata:
|
||||
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
|
||||
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
|
||||
logger.debug('Using Facebook cookie')
|
||||
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
|
||||
|
||||
|
||||
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
|
||||
|
||||
if item.netloc in ['youtube.com', 'www.youtube.com']:
|
||||
if self.cookies_from_browser:
|
||||
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
|
||||
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
|
||||
elif self.cookie_file:
|
||||
logger.debug(f'Using cookies from file {self.cookie_file}')
|
||||
ydl_options['cookiefile'] = self.cookie_file
|
||||
|
||||
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
|
||||
|
||||
try:
|
||||
|
||||
@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
|
||||
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
|
||||
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
|
||||
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
|
||||
}
|
||||
|
||||
def setup(self) -> None:
|
||||
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
|
||||
try:
|
||||
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
|
||||
my_env = os.environ.copy()
|
||||
if self.socks_proxy_host and self.socks_proxy_port:
|
||||
if self.proxy_server:
|
||||
logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
|
||||
my_env["PROXY_SERVER"] = self.proxy_server
|
||||
elif self.socks_proxy_host and self.socks_proxy_port:
|
||||
logger.debug("Using SOCKS proxy for browsertrix-crawler")
|
||||
my_env["SOCKS_HOST"] = self.socks_proxy_host
|
||||
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
|
||||
_MAJOR = "0"
|
||||
_MINOR = "11"
|
||||
_MINOR = "13"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "5"
|
||||
_PATCH = "1"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
72
todo.md
72
todo.md
@@ -1,72 +0,0 @@
|
||||
------ AA + API
|
||||
|
||||
|
||||
|
||||
2024-03-05 11:57:12.910 | ERROR | auto_archiver.core.orchestrator:archive:116 - ERROR enricher wacz_archiver_enricher: 'WaczArchiverEnricher' object has no attri bute 'browsertrix_home_host': Traceback (most recent call last):
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 114, in archive
|
||||
try: e.enrich(result)
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/enrichers/wacz_enricher.py", line 70, in enrich
|
||||
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
|
||||
AttributeError: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host'
|
||||
|
||||
|
||||
|
||||
-------- API
|
||||
|
||||
|
||||
2024-02-29 17:12:06.078 | WARNING | worker:task_failure_notifier:100 - 😅 From task_failure_notifier ==> Task failed successfully!
|
||||
2024-02-29 17:12:06.078 | ERROR | worker:task_failure_notifier:101 - list index out of range
|
||||
2024-02-29 17:12:06.078 | ERROR | worker:task_failure_notifier:102 - <traceback object at 0x7f3db75446c0>
|
||||
2024-02-29 17:12:06.079 | ERROR | worker:task_failure_notifier:103 - File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 412, in trace_task
|
||||
R = retval = fun(*args, **kwargs)
|
||||
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 704, in __protected_call__
|
||||
return self.run(*args, **kwargs)
|
||||
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 50, in run
|
||||
raise task.retry(exc=exc, **retry_kwargs)
|
||||
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/task.py", line 706, in retry
|
||||
raise_with_context(exc)
|
||||
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 35, in run
|
||||
return task._orig_run(*args, **kwargs)
|
||||
|
||||
File "/app/worker.py", line 35, in create_archive_task
|
||||
invalid = is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)
|
||||
|
||||
File "/app/worker.py", line 160, in is_group_invalid_for_user
|
||||
if not crud.is_user_in_group(session, group_id, author_id):
|
||||
|
||||
File "/app/db/crud.py", line 93, in is_user_in_group
|
||||
return len(group_name) and len(email) and group_name in get_user_groups(db, email)
|
||||
|
||||
File "/app/db/crud.py", line 103, in get_user_groups
|
||||
domain_level_groups = DOMAIN_GROUPS.get(email.split('@')[1], [])
|
||||
|
||||
------------------ API
|
||||
|
||||
[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/b0c88017bb047ff43fc49907/3811d9d0c74541929f4a72d0.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ff30ece740738d060229c5da/e43172422e274c2a8f9529ff.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'post 3308982791113602520'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/23f218c518e2d5a17fe856bd/bad85f53a8e54c26991cdff9.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987757405'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/2ffc8c65d6bfec7ef5402bda/520a10e7a7e14028be1cc1c8.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790970975889'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/da0486669cbb102e6221d94c/65b151ee59114ea5b61cfe96.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979533474'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/d8c4db3247780324b6fa6d4a/df195ff24b104182bf255610.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979331432'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/715432d8038abe50c8c89994/f49554bf621848e5881388ee.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982791122025152'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/f143ce41b1eb329a0c404448/6b3d604f676c4ef583b54ff3.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987824546') ... displaying 10 of 253 total bound parameter sets ... ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'timestamp_authority_filesno-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/261cb9cefe3c373ad5f7f305/545253c297124d31bbf817c9.html', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', '_final_media'))]
|
||||
|
||||
|
||||
2024-02-25 15:43:13.142 | DEBUG | worker:convert_if_media:200 - error parsing {'pk': '2045172809601551458', 'id': '2045172809601551458_178884643', 'code': 'Bxh6dWkloxi', 'taken_at': '2019-05-16T16:20:16Z', 'media_type': 1, 'product_type': 'story', 'thumbnail_url': 'https://scontent-sjc3-1.cdninstagram.com/v/t51.12442-15/58604296_289070981969290_2714055836620897014_n.jpg?stp=dst-jpg_e35&efg=eyJ2ZW5jb2RlX3RhZyI6ImltYWdlX3VybGdlbi4xMDI0eDE4MjAuc2RyIn0&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=110&_nc_ohc=ri2YWjVH4dkAX9TFQox&edm=ANmP7GQBAAAA&ccb=7-5&ig_cache_key=MjA0NTE3MjgwOTYwMTU1MTQ1OA%3D%3D.2-ccb7-5&oh=00_AfCIxHr9jkUmeq9NgbmTpWtURV_eu5JGMRbrsc0WwyO59g&oe=65DD0180&_nc_sid=982cc7', 'user': {'pk': '178884643'}, 'locations': [{'location': {'pk': 218723854, 'name': 'Montañita, Ecuador', 'lng': -80.751982661304, 'lat': -1.829127033905}}]} : 'filename'
|
||||
2024-02-25 15:43:13.224 | WARNING | worker:create_sheet_task:84 - cached result detected: (sqlite3.IntegrityError) UNIQUE constraint failed: archive_urls.url, archive_urls.archive_id
|
||||
[SQL: INSERT INTO archive_urls (url, archive_id, "key") VALUES (?, ?, ?)]
|
||||
[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/3caafdcb057000f8c70610fe/e45ec0b09b854333afa22c1c.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/58f0b188e55ef64b96d04b35/94cd9b3d2d5c4deebb9ccd13.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'story 3310558716812786504_178884643'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/9225487f1a559da4060dcf8e/fe55f26616114856a35cc357.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'post 3301802228841572226'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/e26b8bc09aa4febc53d4a42d/50330faff71d4dc28078d5e6.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228850013597'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/bec948c1bf5b87f1f0923e1d/e4c3ddef88ad487ab0305402.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228917101263'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/38c66be1f1898ea7e0d20d9d/9143edba90184ea387fc856d.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228916961531'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/89f16b246931dc98792d570c/859ea90fd5a84d3cbf1a7b46.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735390140959'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/7ee6f00d8a05e22996307c20/5f5faa30541d406e9f0ea1f2.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735750883968') ... displaying 10 of 587 total bound parameter sets ... ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'timestamp_authority_filesno-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/561ea3dc87c229eb239266f2/8c292219a03445e396001026.html', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', '_final_media'))]
|
||||
|
||||
|
||||
|
||||
2024-02-27 13:03:04.585 | ERROR | auto_archiver.core.orchestrator:archive:128 - ERROR database gsheet_db: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}: Traceback (most recent call last):
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 126, in archive
|
||||
try: d.done(result)
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/databases/gsheet_db.py", line 94, in done
|
||||
gw.batch_set_cell(cell_updates)
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/utils/gworksheet.py", line 104, in batch_set_cell
|
||||
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/worksheet.py", line 1361, in batch_update
|
||||
response = self.client.values_batch_update(self.spreadsheet_id, body=body)
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 263, in values_batch_update
|
||||
r = self.request("post", url, json=body)
|
||||
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 123, in request
|
||||
raise APIError(response)
|
||||
gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}
|
||||
Reference in New Issue
Block a user