mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-10 12:18:30 +03:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 65e3c99483 | | |
| | 888ad8f004 | | |
| | 086a9e6c84 | | |
@@ -18,7 +18,7 @@ steps:
|
||||
# - thumbnail_enricher
|
||||
# - wayback_archiver_enricher
|
||||
# - wacz_enricher
|
||||
# - pdq_hash_enricher
|
||||
# - pdq_hash_enricher # if you want to calculate hashes for thumbnails, include this after thumbnail_enricher
|
||||
formatter: html_formatter # defaults to mute_formatter
|
||||
storages:
|
||||
- local_storage
|
||||
|
||||
@@ -49,7 +49,6 @@ class TelegramArchiver(Archiver):
|
||||
if video is None:
|
||||
logger.warning("could not find video")
|
||||
image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
|
||||
logger.info(image_tags)
|
||||
|
||||
image_urls = []
|
||||
for im in image_tags:
|
||||
|
||||
@@ -6,6 +6,7 @@ from slugify import slugify
|
||||
|
||||
from . import Archiver
|
||||
from ..core import Metadata, Media
|
||||
from ..utils.misc import remove_get_parameters
|
||||
|
||||
|
||||
class TwitterArchiver(Archiver):
|
||||
@@ -133,7 +134,7 @@ class TwitterArchiver(Archiver):
|
||||
media = Media(filename="")
|
||||
media.set("src", u)
|
||||
ext = ""
|
||||
if (mtype := mimetypes.guess_type(u)[0]):
|
||||
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
|
||||
ext = mimetypes.guess_extension(mtype)
|
||||
|
||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
import os, json, requests
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
|
||||
def mkdir_if_not_exists(folder):
|
||||
@@ -20,6 +21,13 @@ def expand_url(url):
|
||||
logger.error(f'Failed to expand url {url}')
|
||||
return url
|
||||
|
||||
def remove_get_parameters(url):
|
||||
# http://example.com/file.mp4?t=1 -> http://example.com/file.mp4
|
||||
# useful for mimetypes to work
|
||||
parsed_url = urlparse(url)
|
||||
new_url = urlunparse(parsed_url._replace(query=''))
|
||||
return new_url
|
||||
|
||||
|
||||
def getattr_or(o: object, prop: str, default=None):
|
||||
try:
|
||||
|
||||
@@ -3,7 +3,7 @@ _MAJOR = "0"
|
||||
_MINOR = "5"
|
||||
# On main and in a nightly release the patch should be one ahead of the last
|
||||
# released build.
|
||||
_PATCH = "27"
|
||||
_PATCH = "28"
|
||||
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
|
||||
# https://semver.org/#is-v123-a-semantic-version for the semantics.
|
||||
_SUFFIX = ""
|
||||
|
||||
Reference in New Issue
Block a user