Compare commits

..

11 Commits

Author SHA1 Message Date
Patrick Robertson
82c00d491d Option to provide cookies for use by ytdl, fixes #150 2024-12-20 07:14:49 +01:00
msramalho
e49550163f adds proxy_server option to wacz 2024-10-06 10:45:34 +06:00
msramalho
e6f5981afc numpy version downgrade 2024-10-06 10:10:04 +06:00
msramalho
c62bf1a34d yt-dlp version bump 2024-10-05 17:43:07 +06:00
msramalho
b166d57e61 v0.12.0 bump 2024-08-21 13:34:34 +01:00
msramalho
11c3288267 closes #146 2024-08-21 13:33:58 +01:00
msramalho
004143a58a version bump v0.11.6 2024-07-18 11:27:39 +01:00
msramalho
686f0027c4 adds new entries to example orchestration file 2024-07-18 11:27:15 +01:00
dependabot[bot]
b03cf32c73 Bump authlib from 1.3.0 to 1.3.1 (#144)
Bumps [authlib](https://github.com/lepture/authlib) from 1.3.0 to 1.3.1.
- [Release notes](https://github.com/lepture/authlib/releases)
- [Changelog](https://github.com/lepture/authlib/blob/master/docs/changelog.rst)
- [Commits](https://github.com/lepture/authlib/compare/v1.3.0...v1.3.1)

---
updated-dependencies:
- dependency-name: authlib
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-07-18 11:26:22 +01:00
msramalho
dc9e64397e bumping yt-dlp 2024-07-18 11:23:09 +01:00
msramalho
c7bc5e2988 cleanup 2024-05-15 11:04:29 +01:00
9 changed files with 1003 additions and 1089 deletions

View File

@@ -30,10 +30,10 @@ tqdm = "*"
jinja2 = "*"
cryptography = "*"
dataclasses-json = "*"
yt-dlp = "*"
yt-dlp = "2024.09.27"
vk-url-scraper = "*"
requests = {extras = ["socks"], version = "*"}
numpy = "*"
numpy = "1.26.4"
warcio = "*"
jsonlines = "*"
pysubs2 = "*"

1974
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -47,7 +47,7 @@ Docker works like a virtual machine running inside your computer, it isolates ev
<details><summary><code>Python package instructions</code></summary>
1. make sure you have python 3.8 or higher installed
1. make sure you have python 3.10 or higher installed
2. install the package `pip/pipenv/conda install auto-archiver`
3. test it's installed with `auto-archiver --help`
4. run it with your orchestration file and pass any flags you want in the command line `auto-archiver --config secrets/orchestration.yaml` if your orchestration file is inside a `secrets/`, which we advise
@@ -108,7 +108,7 @@ configurations:
# ... configurations for the other steps here ...
```
To see all available `steps` (which archivers, storages, databses, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
To see all available `steps` (which archivers, storages, databases, ...) exist check the [example.orchestration.yaml](example.orchestration.yaml).
All the `configurations` in the `orchestration.yaml` file (you can name it differently but need to pass it in the `--config FILENAME` argument) can be seen in the console by using the `--help` flag. They can also be overwritten, for example if you are using the `cli_feeder` to archive from the command line and want to provide the URLs you should do:

View File

@@ -16,8 +16,13 @@ steps:
# - wacz_archiver_enricher
enrichers:
- hash_enricher
# - meta_enricher
# - metadata_enricher
# - screenshot_enricher
# - pdq_hash_enricher
# - ssl_enricher
# - timestamping_enricher
# - whisper_enricher
# - thumbnail_enricher
# - wayback_archiver_enricher
# - wacz_archiver_enricher
@@ -89,6 +94,14 @@ configurations:
password: "vk pass"
session_file: "secrets/vk_config.v2.json"
youtubedl_archiver:
subtitles: true
# use one of the following two methods to authenticate in youtube - either provide a cookies file or use the cookies of the given browser
# for more information, see https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp
# cookie_file: "secrets/youtube_cookies.txt"
# cookies_from_browser: firefox
# proxy: socks5://proxy-user:password@proxy-ip:port
screenshot_enricher:
width: 1280
height: 2300

View File

@@ -27,7 +27,7 @@ package_dir=
=src
packages=find:
find_packages=true
python_requires = >=3.8
python_requires = >=3.10
[options.package_data]
* = *.html

View File

@@ -30,6 +30,8 @@ class YoutubeDLArchiver(Archiver):
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
def download(self, item: Metadata) -> Metadata:
@@ -38,8 +40,17 @@ class YoutubeDLArchiver(Archiver):
if item.netloc in ['facebook.com', 'www.facebook.com'] and self.facebook_cookie:
logger.debug('Using Facebook cookie')
yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist , 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
if item.netloc in ['youtube.com', 'www.youtube.com']:
if self.cookies_from_browser:
logger.debug(f'Extracting cookies from browser {self.cookies_from_browser} for Youtube')
ydl_options['cookiesfrombrowser'] = (self.cookies_from_browser,)
elif self.cookie_file:
logger.debug(f'Using cookies from file {self.cookie_file}')
ydl_options['cookiefile'] = self.cookie_file
ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
try:

View File

@@ -34,6 +34,7 @@ class WaczArchiverEnricher(Enricher, Archiver):
"extract_screenshot": {"default": True, "help": "If enabled the screenshot captured by browsertrix will be extracted into separate Media and appear in the html report. The .wacz file will be kept untouched."},
"socks_proxy_host": {"default": None, "help": "SOCKS proxy host for browsertrix-crawler, use in combination with socks_proxy_port. eg: user:password@host"},
"socks_proxy_port": {"default": None, "help": "SOCKS proxy port for browsertrix-crawler, use in combination with socks_proxy_host. eg 1234"},
"proxy_server": {"default": None, "help": "SOCKS server proxy URL, in development"},
}
def setup(self) -> None:
@@ -113,7 +114,10 @@ class WaczArchiverEnricher(Enricher, Archiver):
try:
logger.info(f"Running browsertrix-crawler: {' '.join(cmd)}")
my_env = os.environ.copy()
if self.socks_proxy_host and self.socks_proxy_port:
if self.proxy_server:
logger.debug("Using PROXY_SERVER proxy for browsertrix-crawler")
my_env["PROXY_SERVER"] = self.proxy_server
elif self.socks_proxy_host and self.socks_proxy_port:
logger.debug("Using SOCKS proxy for browsertrix-crawler")
my_env["SOCKS_HOST"] = self.socks_proxy_host
my_env["SOCKS_PORT"] = str(self.socks_proxy_port)

View File

@@ -1,9 +1,9 @@
_MAJOR = "0"
_MINOR = "11"
_MINOR = "13"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "5"
_PATCH = "1"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""

72
todo.md
View File

@@ -1,72 +0,0 @@
------ AA + API
2024-03-05 11:57:12.910 | ERROR | auto_archiver.core.orchestrator:archive:116 - ERROR enricher wacz_archiver_enricher: 'WaczArchiverEnricher' object has no attri bute 'browsertrix_home_host': Traceback (most recent call last):
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 114, in archive
try: e.enrich(result)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/enrichers/wacz_enricher.py", line 70, in enrich
browsertrix_home_host = self.browsertrix_home_host or os.path.abspath(ArchivingContext.get_tmp_dir())
AttributeError: 'WaczArchiverEnricher' object has no attribute 'browsertrix_home_host'
-------- API
2024-02-29 17:12:06.078 | WARNING | worker:task_failure_notifier:100 - 😅 From task_failure_notifier ==> Task failed successfully!
2024-02-29 17:12:06.078 | ERROR | worker:task_failure_notifier:101 - list index out of range
2024-02-29 17:12:06.078 | ERROR | worker:task_failure_notifier:102 - <traceback object at 0x7f3db75446c0>
2024-02-29 17:12:06.079 | ERROR | worker:task_failure_notifier:103 - File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 412, in trace_task
R = retval = fun(*args, **kwargs)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/trace.py", line 704, in __protected_call__
return self.run(*args, **kwargs)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 50, in run
raise task.retry(exc=exc, **retry_kwargs)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/task.py", line 706, in retry
raise_with_context(exc)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/celery/app/autoretry.py", line 35, in run
return task._orig_run(*args, **kwargs)
File "/app/worker.py", line 35, in create_archive_task
invalid = is_group_invalid_for_user(archive.public, archive.group_id, archive.author_id)
File "/app/worker.py", line 160, in is_group_invalid_for_user
if not crud.is_user_in_group(session, group_id, author_id):
File "/app/db/crud.py", line 93, in is_user_in_group
return len(group_name) and len(email) and group_name in get_user_groups(db, email)
File "/app/db/crud.py", line 103, in get_user_groups
domain_level_groups = DOMAIN_GROUPS.get(email.split('@')[1], [])
------------------ API
[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/b0c88017bb047ff43fc49907/3811d9d0c74541929f4a72d0.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ff30ece740738d060229c5da/e43172422e274c2a8f9529ff.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'post 3308982791113602520'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/23f218c518e2d5a17fe856bd/bad85f53a8e54c26991cdff9.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987757405'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/2ffc8c65d6bfec7ef5402bda/520a10e7a7e14028be1cc1c8.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790970975889'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/da0486669cbb102e6221d94c/65b151ee59114ea5b61cfe96.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979533474'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/d8c4db3247780324b6fa6d4a/df195ff24b104182bf255610.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790979331432'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/715432d8038abe50c8c89994/f49554bf621848e5881388ee.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982791122025152'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/f143ce41b1eb329a0c404448/6b3d604f676c4ef583b54ff3.jpg', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'image 3308982790987824546') ... displaying 10 of 253 total bound parameter sets ... ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', 'timestamp_authority_filesno-dups/385ac59684e1148643c762de/0d72d23e6652474a977fe1d6_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/261cb9cefe3c373ad5f7f305/545253c297124d31bbf817c9.html', '314b2eb3-98a4-4a4b-be01-7f4d3ea1c4c2', '_final_media'))]
2024-02-25 15:43:13.142 | DEBUG | worker:convert_if_media:200 - error parsing {'pk': '2045172809601551458', 'id': '2045172809601551458_178884643', 'code': 'Bxh6dWkloxi', 'taken_at': '2019-05-16T16:20:16Z', 'media_type': 1, 'product_type': 'story', 'thumbnail_url': 'https://scontent-sjc3-1.cdninstagram.com/v/t51.12442-15/58604296_289070981969290_2714055836620897014_n.jpg?stp=dst-jpg_e35&efg=eyJ2ZW5jb2RlX3RhZyI6ImltYWdlX3VybGdlbi4xMDI0eDE4MjAuc2RyIn0&_nc_ht=scontent-sjc3-1.cdninstagram.com&_nc_cat=110&_nc_ohc=ri2YWjVH4dkAX9TFQox&edm=ANmP7GQBAAAA&ccb=7-5&ig_cache_key=MjA0NTE3MjgwOTYwMTU1MTQ1OA%3D%3D.2-ccb7-5&oh=00_AfCIxHr9jkUmeq9NgbmTpWtURV_eu5JGMRbrsc0WwyO59g&oe=65DD0180&_nc_sid=982cc7', 'user': {'pk': '178884643'}, 'locations': [{'location': {'pk': 218723854, 'name': 'Montañita, Ecuador', 'lng': -80.751982661304, 'lat': -1.829127033905}}]} : 'filename'
2024-02-25 15:43:13.224 | WARNING | worker:create_sheet_task:84 - cached result detected: (sqlite3.IntegrityError) UNIQUE constraint failed: archive_urls.url, archive_urls.archive_id
[SQL: INSERT INTO archive_urls (url, archive_id, "key") VALUES (?, ?, ?)]
[parameters: (('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/3caafdcb057000f8c70610fe/e45ec0b09b854333afa22c1c.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'profile_picture'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/58f0b188e55ef64b96d04b35/94cd9b3d2d5c4deebb9ccd13.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'story 3310558716812786504_178884643'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/9225487f1a559da4060dcf8e/fe55f26616114856a35cc357.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'post 3301802228841572226'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/e26b8bc09aa4febc53d4a42d/50330faff71d4dc28078d5e6.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228850013597'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/bec948c1bf5b87f1f0923e1d/e4c3ddef88ad487ab0305402.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228917101263'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/38c66be1f1898ea7e0d20d9d/9143edba90184ea387fc856d.jpg', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'image 3301802228916961531'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/89f16b246931dc98792d570c/859ea90fd5a84d3cbf1a7b46.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735390140959'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/7ee6f00d8a05e22996307c20/5f5faa30541d406e9f0ea1f2.mp4', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'video 3301801735750883968') ... displaying 10 of 587 total bound parameter sets ... ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', 'timestamp_authority_filesno-dups/ef3bc69b89e40504cd936e8a/f4a13bf448c04af0b8c3aeae_3.0'), ('https://bellingcat-archive.nyc3.cdn.digitaloceanspaces.com/no-dups/561ea3dc87c229eb239266f2/8c292219a03445e396001026.html', '51fe261b-ad32-4c8a-96cd-801ccfa472fa', '_final_media'))]
2024-02-27 13:03:04.585 | ERROR | auto_archiver.core.orchestrator:archive:128 - ERROR database gsheet_db: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}: Traceback (most recent call last):
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/core/orchestrator.py", line 126, in archive
try: d.done(result)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/databases/gsheet_db.py", line 94, in done
gw.batch_set_cell(cell_updates)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/auto_archiver/utils/gworksheet.py", line 104, in batch_set_cell
self.wks.batch_update(cell_updates, value_input_option='USER_ENTERED')
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/worksheet.py", line 1361, in batch_update
response = self.client.values_batch_update(self.spreadsheet_id, body=body)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 263, in values_batch_update
r = self.request("post", url, json=body)
File "/root/.local/share/virtualenvs/app-4PlAip0Q/lib/python3.10/site-packages/gspread/http_client.py", line 123, in request
raise APIError(response)
gspread.exceptions.APIError: {'code': 400, 'message': 'Invalid data[4]: Your input contains more than the maximum of 50000 characters in a single cell.', 'status': 'INVALID_ARGUMENT'}