mirror of
https://github.com/bellingcat/auto-archiver.git
synced 2026-06-12 21:28:29 +03:00
Merge tests from version with context.
This commit is contained in:
@@ -12,10 +12,11 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet
|
||||
|
||||
class GsheetsDb(Database):
|
||||
"""
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
NB: only works if GsheetFeeder is used.
|
||||
could be updated in the future to support non-GsheetFeeder metadata
|
||||
"""
|
||||
|
||||
|
||||
def started(self, item: Metadata) -> None:
|
||||
logger.warning(f"STARTED {item}")
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
@@ -57,7 +58,7 @@ class GsheetsDb(Database):
|
||||
media: Media = item.get_final_media()
|
||||
if hasattr(media, "urls"):
|
||||
batch_if_valid('archive', "\n".join(media.urls))
|
||||
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||
batch_if_valid('date', True, self._get_current_datetime_iso())
|
||||
batch_if_valid('title', item.get_title())
|
||||
batch_if_valid('text', item.get("content", ""))
|
||||
batch_if_valid('timestamp', item.get_timestamp())
|
||||
@@ -85,6 +86,12 @@ class GsheetsDb(Database):
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
@staticmethod
|
||||
def _get_current_datetime_iso() -> str:
|
||||
"""Helper method to generate the current datetime in ISO format."""
|
||||
return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
|
||||
try:
|
||||
gw, row = self._retrieve_gsheet(item)
|
||||
@@ -93,9 +100,11 @@ class GsheetsDb(Database):
|
||||
logger.debug(f"Unable to update sheet: {e}")
|
||||
|
||||
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
|
||||
|
||||
if gsheet := item.get_context("gsheet"):
|
||||
gw: GWorksheet = gsheet.get("worksheet")
|
||||
row: int = gsheet.get("row")
|
||||
# todo doesn't exist, should be passed from
|
||||
elif self.sheet_id:
|
||||
print(self.sheet_id)
|
||||
|
||||
|
||||
@@ -34,19 +34,30 @@ class InstagramTbotExtractor(Extractor):
|
||||
"""
|
||||
super().setup(configs)
|
||||
logger.info(f"SETUP {self.name} checking login...")
|
||||
self._prepare_session_file()
|
||||
self._initialize_telegram_client()
|
||||
|
||||
# make a copy of the session that is used exclusively with this archiver instance
|
||||
def _prepare_session_file(self):
|
||||
"""
|
||||
Creates a copy of the session file for exclusive use with this archiver instance.
|
||||
Ensures that a valid session file exists before proceeding.
|
||||
"""
|
||||
new_session_file = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}.session")
|
||||
if not os.path.exists(f"{self.session_file}.session"):
|
||||
raise FileNotFoundError(f"session file {self.session_file}.session not found, "
|
||||
f"to set this up run the setup script in scripts/telegram_setup.py")
|
||||
raise FileNotFoundError(f"Session file {self.session_file}.session not found.")
|
||||
shutil.copy(self.session_file + ".session", new_session_file)
|
||||
self.session_file = new_session_file.replace(".session", "")
|
||||
|
||||
def _initialize_telegram_client(self):
|
||||
"""Initializes the Telegram client."""
|
||||
try:
|
||||
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
|
||||
except OperationalError as e:
|
||||
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
|
||||
logger.error(
|
||||
f"Unable to access the {self.session_file} session. "
|
||||
"Ensure that you don't use the same session file here and in telethon_extractor. "
|
||||
"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
|
||||
)
|
||||
with self.client.start():
|
||||
logger.success(f"SETUP {self.name} login works.")
|
||||
|
||||
@@ -63,32 +74,49 @@ class InstagramTbotExtractor(Extractor):
|
||||
result = Metadata()
|
||||
tmp_dir = self.tmp_dir
|
||||
with self.client.start():
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_id = self.client.send_message(entity=chat, message=url).id
|
||||
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
chat, since_id = self._send_url_to_bot(url)
|
||||
message = self._process_messages(chat, since_id, tmp_dir, result)
|
||||
|
||||
if "You must enter a URL to a post" in message:
|
||||
if "You must enter a URL to a post" in message:
|
||||
logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
return False
|
||||
|
||||
# # TODO: It currently returns this as a success - is that intentional?
|
||||
# if "Media not found or unavailable" in message:
|
||||
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
|
||||
# return False
|
||||
|
||||
if message:
|
||||
result.set_content(message).set_title(message[:128])
|
||||
|
||||
return result.success("insta-via-bot")
|
||||
|
||||
def _send_url_to_bot(self, url: str):
|
||||
"""
|
||||
Sends the URL to the 'instagram_load_bot' and returns (chat, since_id).
|
||||
"""
|
||||
chat = self.client.get_entity("instagram_load_bot")
|
||||
since_message = self.client.send_message(entity=chat, message=url)
|
||||
return chat, since_message.id
|
||||
|
||||
def _process_messages(self, chat, since_id, tmp_dir, result):
|
||||
attempts = 0
|
||||
seen_media = []
|
||||
message = ""
|
||||
time.sleep(3)
|
||||
# media is added before text by the bot so it can be used as a stop-logic mechanism
|
||||
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
|
||||
attempts += 1
|
||||
time.sleep(1)
|
||||
for post in self.client.iter_messages(chat, min_id=since_id):
|
||||
since_id = max(since_id, post.id)
|
||||
# Skip known filler message:
|
||||
if post.message == 'The bot receives information through https://hikerapi.com/p/hJqpppqi':
|
||||
continue
|
||||
if post.media and post.id not in seen_media:
|
||||
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
|
||||
media = self.client.download_media(post.media, filename_dest)
|
||||
if media:
|
||||
result.add_media(Media(media))
|
||||
seen_media.append(post.id)
|
||||
if post.message: message += post.message
|
||||
return message.strip()
|
||||
@@ -1 +1 @@
|
||||
from .telethon_extractor import TelethonArchiver
|
||||
from .telethon_extractor import TelethonExtractor
|
||||
@@ -13,7 +13,7 @@ from auto_archiver.core import Metadata, Media
|
||||
from auto_archiver.utils import random_str
|
||||
|
||||
|
||||
class TelethonArchiver(Extractor):
|
||||
class TelethonExtractor(Extractor):
|
||||
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
|
||||
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user