mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 13:18:28 +03:00
Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
65e3c99483 | ||
|
|
888ad8f004 | ||
|
|
086a9e6c84 | ||
|
|
4d80ee6f02 | ||
|
|
92569ae6be |
2
Pipfile
2
Pipfile
@@ -30,7 +30,7 @@ tqdm = "*"
|
|||||||
jinja2 = "*"
|
jinja2 = "*"
|
||||||
cryptography = "*"
|
cryptography = "*"
|
||||||
dataclasses-json = "*"
|
dataclasses-json = "*"
|
||||||
yt-dlp = ">=2023.2.17"
|
yt-dlp = "*"
|
||||||
vk-url-scraper = "*"
|
vk-url-scraper = "*"
|
||||||
uwsgi = "*"
|
uwsgi = "*"
|
||||||
requests = {extras = ["socks"], version = "*"}
|
requests = {extras = ["socks"], version = "*"}
|
||||||
|
|||||||
2
Pipfile.lock
generated
2
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "6e76638769e56f28c2cc56e548d3ac1752b36db2160e23a865089c80e584dcba"
|
"sha256": "84ebe4378c02b26d0663f6d7ede49064ec7428dddca668c8d8a5d64cf9191f09"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ steps:
|
|||||||
# - thumbnail_enricher
|
# - thumbnail_enricher
|
||||||
# - wayback_archiver_enricher
|
# - wayback_archiver_enricher
|
||||||
# - wacz_enricher
|
# - wacz_enricher
|
||||||
# - pdq_hash_enricher
|
# - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
|
||||||
formatter: html_formatter # defaults to mute_formatter
|
formatter: html_formatter # defaults to mute_formatter
|
||||||
storages:
|
storages:
|
||||||
- local_storage
|
- local_storage
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ class TelegramArchiver(Archiver):
|
|||||||
video = s.find("video")
|
video = s.find("video")
|
||||||
if video is None:
|
if video is None:
|
||||||
logger.warning("could not find video")
|
logger.warning("could not find video")
|
||||||
image_tags = s.find_all(class_="js-message_photo")
|
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
||||||
|
|
||||||
image_urls = []
|
image_urls = []
|
||||||
for im in image_tags:
|
for im in image_tags:
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from slugify import slugify
|
|||||||
|
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media
|
||||||
|
from ..utils.misc import remove_get_parameters
|
||||||
|
|
||||||
|
|
||||||
class TwitterArchiver(Archiver):
|
class TwitterArchiver(Archiver):
|
||||||
@@ -93,6 +94,7 @@ class TwitterArchiver(Archiver):
|
|||||||
Hack alternative working again.
|
Hack alternative working again.
|
||||||
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
|
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
|
||||||
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
|
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
|
||||||
|
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
|
||||||
"""
|
"""
|
||||||
headers = {
|
headers = {
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
|
||||||
@@ -132,7 +134,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
ext = ""
|
ext = ""
|
||||||
if (mtype := mimetypes.guess_type(u)[0]):
|
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
|
||||||
ext = mimetypes.guess_extension(mtype)
|
ext = mimetypes.guess_extension(mtype)
|
||||||
|
|
||||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
import os, json, requests
|
import os, json, requests
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
|
|
||||||
def mkdir_if_not_exists(folder):
|
def mkdir_if_not_exists(folder):
|
||||||
@@ -20,6 +21,13 @@ def expand_url(url):
|
|||||||
logger.error(f'Failed to expand url {url}')
|
logger.error(f'Failed to expand url {url}')
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
def remove_get_parameters(url):
    """Return *url* with its query string and fragment removed.

    Example:
        http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
        http://example.com/file.mp4?t=1#t=10 -> http://example.com/file.mp4

    Intended for use before extension/MIME-type guessing (e.g. with
    ``mimetypes.guess_type``), where anything trailing the path can hide
    the real file extension.

    Args:
        url: The URL string to clean.

    Returns:
        The URL rebuilt from its scheme, netloc, path and path parameters
        only. A URL without query or fragment is returned unchanged.
    """
    parsed_url = urlparse(url)
    # The fragment is cleared together with the query: a leftover "#..."
    # would still be glued to the extension ("file.mp4#t=10") and defeat
    # the guessing this helper exists to support.
    return urlunparse(parsed_url._replace(query='', fragment=''))
|
||||||
|
|
||||||
|
|
||||||
def getattr_or(o: object, prop: str, default=None):
|
def getattr_or(o: object, prop: str, default=None):
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
|||||||
_MINOR = "5"
|
_MINOR = "5"
|
||||||
# On main and in a nightly release the patch should be one ahead of the last
|
# On main and in a nightly release the patch should be one ahead of the last
|
||||||
# released build.
|
# released build.
|
||||||
_PATCH = "26"
|
_PATCH = "28"
|
||||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||||
_SUFFIX = ""
|
_SUFFIX = ""
|
||||||
|
|||||||
Reference in New Issue
Block a user