mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 05:08:28 +03:00
Merge branch 'load_modules' into docs_update
This commit is contained in:
@@ -68,7 +68,7 @@ class GsheetsFeeder(Feeder):
|
||||
folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))
|
||||
|
||||
m.set_context('folder', folder)
|
||||
m.set_context('worksheet', {"row": row, "worksheet": gw})
|
||||
m.set_context('gsheet', {"row": row, "worksheet": gw})
|
||||
yield m
|
||||
|
||||
logger.success(f'Finished worksheet {wks.title}')
|
||||
|
||||
@@ -12,6 +12,7 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Enricher
|
||||
from auto_archiver.core import Metadata
|
||||
from auto_archiver.utils.misc import calculate_file_hash
|
||||
|
||||
|
||||
class HashEnricher(Enricher):
|
||||
@@ -29,15 +30,10 @@ class HashEnricher(Enricher):
|
||||
to_enrich.media[i].set("hash", f"{self.algorithm}:{hd}")
|
||||
|
||||
def calculate_hash(self, filename) -> str:
|
||||
hash = None
|
||||
hash_algo = None
|
||||
if self.algorithm == "SHA-256":
|
||||
hash = hashlib.sha256()
|
||||
hash_algo = hashlib.sha256
|
||||
elif self.algorithm == "SHA3-512":
|
||||
hash = hashlib.sha3_512()
|
||||
hash_algo = hashlib.sha3_512
|
||||
else: return ""
|
||||
with open(filename, "rb") as f:
|
||||
while True:
|
||||
buf = f.read(self.chunksize)
|
||||
if not buf: break
|
||||
hash.update(buf)
|
||||
return hash.hexdigest()
|
||||
return calculate_file_hash(filename, hash_algo, self.chunksize)
|
||||
|
||||
@@ -7,12 +7,11 @@ from loguru import logger
|
||||
|
||||
from auto_archiver.core import Media
|
||||
from auto_archiver.core import Storage
|
||||
from auto_archiver.modules.hash_enricher import HashEnricher
|
||||
from auto_archiver.utils.misc import random_str
|
||||
from auto_archiver.utils.misc import calculate_file_hash, random_str
|
||||
|
||||
NO_DUPLICATES_FOLDER = "no-dups/"
|
||||
|
||||
class S3Storage(Storage, HashEnricher):
|
||||
class S3Storage(Storage):
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
@@ -42,14 +41,13 @@ class S3Storage(Storage, HashEnricher):
|
||||
extra_args['ContentType'] = media.mimetype
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||
|
||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||
return True
|
||||
|
||||
def is_upload_needed(self, media: Media) -> bool:
|
||||
if self.random_no_duplicate:
|
||||
# checks if a folder with the hash already exists, if so it skips the upload
|
||||
hd = self.calculate_hash(media.filename)
|
||||
hd = calculate_file_hash(media.filename)
|
||||
path = os.path.join(NO_DUPLICATES_FOLDER, hd[:24])
|
||||
|
||||
if existing_key:=self.file_in_folder(path):
|
||||
|
||||
@@ -1 +1 @@
|
||||
from .telethon_extractor import TelethonArchiver
|
||||
from .telethon_extractor import TelethonExtractor
|
||||
@@ -6,14 +6,14 @@ from telethon.tl.functions.messages import ImportChatInviteRequest
|
||||
from telethon.errors.rpcerrorlist import UserAlreadyParticipantError, FloodWaitError, InviteRequestSentError, InviteHashExpiredError
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import re, time, json, os
|
||||
import re, time, os
|
||||
|
||||
from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Extractor):
|
||||
class TelethonExtractor(Extractor):
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user