mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 13:48:28 +03:00
minor improvements
This commit is contained in:
@@ -6,7 +6,7 @@ from slugify import slugify
|
|||||||
|
|
||||||
from . import Archiver
|
from . import Archiver
|
||||||
from ..core import Metadata, Media
|
from ..core import Metadata, Media
|
||||||
from ..utils.misc import remove_get_parameters
|
from ..utils import UrlUtil
|
||||||
|
|
||||||
|
|
||||||
class TwitterArchiver(Archiver):
|
class TwitterArchiver(Archiver):
|
||||||
@@ -78,7 +78,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media.set("src", variant.url)
|
media.set("src", variant.url)
|
||||||
mimetype = variant.contentType
|
mimetype = variant.contentType
|
||||||
elif type(tweet_media) == Photo:
|
elif type(tweet_media) == Photo:
|
||||||
media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig'))
|
media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig').replace('name=small', 'name=orig'))
|
||||||
mimetype = "image/jpeg"
|
mimetype = "image/jpeg"
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Could not get media URL of {tweet_media}")
|
logger.warning(f"Could not get media URL of {tweet_media}")
|
||||||
@@ -96,21 +96,7 @@ class TwitterArchiver(Archiver):
|
|||||||
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
|
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
|
||||||
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
|
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
|
||||||
"""
|
"""
|
||||||
headers = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
|
|
||||||
"Accept": "*/*",
|
|
||||||
"Accept-Language": "en-US,en;q=0.5",
|
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
|
||||||
"Origin": "https://platform.twitter.com",
|
|
||||||
"Connection": "keep-alive",
|
|
||||||
"Referer": "https://platform.twitter.com/",
|
|
||||||
"Sec-Fetch-Dest": "empty",
|
|
||||||
"Sec-Fetch-Mode": "cors",
|
|
||||||
"Sec-Fetch-Site": "cross-site",
|
|
||||||
"Pragma": "no-cache",
|
|
||||||
"Cache-Control": "no-cache",
|
|
||||||
"TE": "trailers"
|
|
||||||
}
|
|
||||||
logger.debug(f"Trying twitter hack for {url=}")
|
logger.debug(f"Trying twitter hack for {url=}")
|
||||||
result = Metadata()
|
result = Metadata()
|
||||||
|
|
||||||
@@ -134,7 +120,7 @@ class TwitterArchiver(Archiver):
|
|||||||
media = Media(filename="")
|
media = Media(filename="")
|
||||||
media.set("src", u)
|
media.set("src", u)
|
||||||
ext = ""
|
ext = ""
|
||||||
if (mtype := mimetypes.guess_type(remove_get_parameters(u))[0]):
|
if (mtype := mimetypes.guess_type(UrlUtil.remove_get_parameters(u))[0]):
|
||||||
ext = mimetypes.guess_extension(mtype)
|
ext = mimetypes.guess_extension(mtype)
|
||||||
|
|
||||||
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}', item)
|
||||||
|
|||||||
@@ -109,6 +109,8 @@ class ArchivingOrchestrator:
|
|||||||
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
|
||||||
result.store()
|
result.store()
|
||||||
|
|
||||||
|
#TODO: remove any duplicate media, if hash is available
|
||||||
|
|
||||||
# 6 - format and store formatted if needed
|
# 6 - format and store formatted if needed
|
||||||
# enrichers typically need access to already stored URLs etc
|
# enrichers typically need access to already stored URLs etc
|
||||||
if (final_media := self.formatter.format(result)):
|
if (final_media := self.formatter.format(result)):
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
|
import traceback
|
||||||
import pdqhash
|
import pdqhash
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image
|
from PIL import Image, UnidentifiedImageError
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from . import Enricher
|
from . import Enricher
|
||||||
@@ -32,11 +33,15 @@ class PdqHashEnricher(Enricher):
|
|||||||
media.set("pdq_hash", hd)
|
media.set("pdq_hash", hd)
|
||||||
|
|
||||||
def calculate_pdq_hash(self, filename):
|
def calculate_pdq_hash(self, filename):
|
||||||
# returns a hexadecimal string with the perceptual hash for the given filename
|
# returns a hexadecimal string with the perceptual hash for the given filename
|
||||||
with Image.open(filename) as img:
|
try:
|
||||||
# convert the image to RGB
|
with Image.open(filename) as img:
|
||||||
image_rgb = np.array(img.convert("RGB"))
|
# convert the image to RGB
|
||||||
# compute the 256-bit PDQ hash (we do not store the quality score)
|
image_rgb = np.array(img.convert("RGB"))
|
||||||
hash_array, _ = pdqhash.compute(image_rgb)
|
# compute the 256-bit PDQ hash (we do not store the quality score)
|
||||||
hash = "".join(str(b) for b in hash_array)
|
hash_array, _ = pdqhash.compute(image_rgb)
|
||||||
return hex(int(hash, 2))[2:]
|
hash = "".join(str(b) for b in hash_array)
|
||||||
|
return hex(int(hash, 2))[2:]
|
||||||
|
except UnidentifiedImageError as e:
|
||||||
|
logger.error(f"Image {filename=} is likely corrupted or in unsupported format {e}: {traceback.format_exc()}")
|
||||||
|
return ""
|
||||||
Reference in New Issue
Block a user