Further tidyups + refactoring for new structure

* Add implementation tests for orchestrator + logging tests
* Standardise method/class vars for extractors to see if they are suitable
* Fix bugs with removing default loguru logger (allows further customisation)
* Fix bug loading required fields from file
*
This commit is contained in:
Patrick Robertson
2025-01-30 13:21:10 +01:00
parent cddae65a90
commit b7d9145f6c
22 changed files with 292 additions and 51 deletions

View File

@@ -28,7 +28,7 @@ class InstagramAPIExtractor(Extractor):
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
global_pattern = re.compile(
valid_url = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
)
@@ -44,7 +44,7 @@ class InstagramAPIExtractor(Extractor):
url.replace("instagr.com", "instagram.com").replace(
"instagr.am", "instagram.com"
)
insta_matches = self.global_pattern.findall(url)
insta_matches = self.valid_url.findall(url)
logger.info(f"{insta_matches=}")
if not len(insta_matches) or len(insta_matches[0]) != 3:
return

View File

@@ -16,10 +16,13 @@ class InstagramExtractor(Extractor):
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
# NB: post regex should be tested before profile
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/")
# https://regex101.com/r/MGPquX/1
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url))
# https://regex101.com/r/6Wbsxa/1
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url))
# TODO: links to stories
def setup(self, config: dict) -> None:

View File

@@ -14,7 +14,7 @@ from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
@@ -92,7 +92,7 @@ class TelethonArchiver(Extractor):
"""
url = item.get_url()
# detect URLs that we definitely cannot handle
match = self.link_pattern.search(url)
match = self.valid_url.search(url)
logger.debug(f"TELETHON: {match=}")
if not match: return False

View File

@@ -12,7 +12,7 @@ from auto_archiver.core import Extractor
from auto_archiver.core import Metadata,Media
class TwitterApiExtractor(Extractor):
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def setup(self, config: dict) -> None:
super().setup(config)
@@ -54,7 +54,7 @@ class TwitterApiExtractor(Extractor):
def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
matches = self.link_pattern.findall(url)
matches = self.valid_url.findall(url)
if not len(matches): return False, False
username, tweet_id = matches[0] # only one URL supported