Merge branch 'main' into remove_dependencies

This commit is contained in:
Patrick Robertson
2025-01-14 17:39:20 +01:00
20 changed files with 528 additions and 163 deletions

View File

@@ -114,6 +114,10 @@ class TwitterArchiver(Archiver):
result = Metadata()
tweet = r.json()
if tweet.get('__typename') == 'TweetTombstone':
logger.error(f"Failed to get tweet {tweet_id}: {tweet['tombstone']['text']['text']}")
return False
urls = []
for p in tweet.get("photos", []):
urls.append(p["url"])
@@ -135,7 +139,7 @@ class TwitterArchiver(Archiver):
media.filename = self.download_from_url(u, f'{slugify(url)}_{i}{ext}')
result.add_media(media)
result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-syndication")
@@ -158,7 +162,8 @@ class TwitterArchiver(Archiver):
.set_timestamp(timestamp)
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
return result.success("twitter-ytdl")
result.status = "twitter-ytdl"
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
mimetype = ""

View File

@@ -14,9 +14,26 @@ class HashEnricher(Enricher):
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
algo_choices = self.configs()["algorithm"]["choices"]
algos = self.configs()["algorithm"]
algo_choices = algos["choices"]
if not getattr(self, 'algorithm', None):
if not config.get('algorithm'):
logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
self.algorithm = algos["default"]
else:
self.algorithm = config["algorithm"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
if not getattr(self, 'chunksize', None):
if config.get('chunksize'):
self.chunksize = config["chunksize"]
else:
self.chunksize = self.configs()["chunksize"]["default"]
self.chunksize = int(self.chunksize)
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
@staticmethod