mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-13 05:38:29 +03:00
Further tidyups + refactoring for new structure
* Add implementation tests for orchestrator + logging tests
* Standardise method/class vars for extractors to see if they are suitable
* Fix bugs with removing default loguru logger (allows further customisation)
* Fix bug loading required fields from file
This commit is contained in:
@@ -28,7 +28,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
|
||||
"""
|
||||
|
||||
global_pattern = re.compile(
|
||||
valid_url = re.compile(
|
||||
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"
|
||||
)
|
||||
|
||||
@@ -44,7 +44,7 @@ class InstagramAPIExtractor(Extractor):
|
||||
url.replace("instagr.com", "instagram.com").replace(
|
||||
"instagr.am", "instagram.com"
|
||||
)
|
||||
insta_matches = self.global_pattern.findall(url)
|
||||
insta_matches = self.valid_url.findall(url)
|
||||
logger.info(f"{insta_matches=}")
|
||||
if not len(insta_matches) or len(insta_matches[0]) != 3:
|
||||
return
|
||||
|
||||
@@ -16,10 +16,13 @@ class InstagramExtractor(Extractor):
|
||||
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
|
||||
"""
|
||||
# NB: post regex should be tested before profile
|
||||
|
||||
# Matches the scheme/host prefix of any Instagram URL (including the short
# instagr.am / instagr.com domains). Dots are escaped so they only match a
# literal "." and look-alike hosts such as "instagramxcom" are rejected.
valid_url = re.compile(r"(?:(?:http|https):\/\/)?(?:www\.)?(?:instagram\.com|instagr\.am|instagr\.com)\/")
|
||||
|
||||
# https://regex101.com/r/MGPquX/1
|
||||
post_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(?:p|reel)\/(\w+)")
|
||||
# Build the post regex from the *source text* of valid_url. Formatting the
# compiled Pattern object itself would interpolate its repr
# ("re.compile('...')") into the new expression and yield a broken regex.
post_pattern = re.compile(r"{valid_url}(?:p|reel)\/(\w+)".format(valid_url=valid_url.pattern))
|
||||
# https://regex101.com/r/6Wbsxa/1
|
||||
profile_pattern = re.compile(r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com|instagr.am|instagr.com)\/(\w+)")
|
||||
# Build the profile regex from the *source text* of valid_url. Formatting the
# compiled Pattern object itself would interpolate its repr
# ("re.compile('...')") into the new expression and yield a broken regex.
profile_pattern = re.compile(r"{valid_url}(\w+)".format(valid_url=valid_url.pattern))
|
||||
# TODO: links to stories
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
|
||||
@@ -14,7 +14,7 @@ from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Extractor):
|
||||
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ class TelethonArchiver(Extractor):
|
||||
"""
|
||||
url = item.get_url()
|
||||
# detect URLs that we definitely cannot handle
|
||||
match = self.link_pattern.search(url)
|
||||
match = self.valid_url.search(url)
|
||||
logger.debug(f"TELETHON: {match=}")
|
||||
if not match: return False
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ from auto_archiver.core import Extractor
|
||||
from auto_archiver.core import Metadata,Media
|
||||
|
||||
class TwitterApiExtractor(Extractor):
|
||||
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
valid_url = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
|
||||
|
||||
def setup(self, config: dict) -> None:
|
||||
super().setup(config)
|
||||
@@ -54,7 +54,7 @@ class TwitterApiExtractor(Extractor):
|
||||
|
||||
def get_username_tweet_id(self, url):
|
||||
# detect URLs that we definitely cannot handle
|
||||
matches = self.link_pattern.findall(url)
|
||||
matches = self.valid_url.findall(url)
|
||||
if not len(matches): return False, False
|
||||
|
||||
username, tweet_id = matches[0] # only one URL supported
|
||||
|
||||
Reference in New Issue
Block a user