Merge tests from version with context.

This commit is contained in:
erinhmclark
2025-02-05 16:42:58 +00:00
parent 91ca325fd5
commit 52542812dc
13 changed files with 1022 additions and 33 deletions

View File

@@ -12,10 +12,11 @@ from auto_archiver.modules.gsheet_feeder import GWorksheet
class GsheetsDb(Database):
"""
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
NB: only works if GsheetFeeder is used.
could be updated in the future to support non-GsheetFeeder metadata
"""
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")
gw, row = self._retrieve_gsheet(item)
@@ -57,7 +58,7 @@ class GsheetsDb(Database):
media: Media = item.get_final_media()
if hasattr(media, "urls"):
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('date', True, self._get_current_datetime_iso())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", ""))
batch_if_valid('timestamp', item.get_timestamp())
@@ -85,6 +86,12 @@ class GsheetsDb(Database):
gw.batch_set_cell(cell_updates)
@staticmethod
def _get_current_datetime_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string.

    The value is produced by `datetime.now(timezone.utc)`, so the result
    always carries an explicit `+00:00` offset.
    """
    # now(tz=...) already returns an aware datetime; no further tzinfo fix-up is needed.
    return datetime.datetime.now(datetime.timezone.utc).isoformat()
def _safe_status_update(self, item: Metadata, new_status: str) -> None:
try:
gw, row = self._retrieve_gsheet(item)
@@ -93,9 +100,11 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
# todo doesn't exist, should be passed from
elif self.sheet_id:
print(self.sheet_id)

View File

@@ -34,19 +34,30 @@ class InstagramTbotExtractor(Extractor):
"""
super().setup(configs)
logger.info(f"SETUP {self.name} checking login...")
self._prepare_session_file()
self._initialize_telegram_client()
# make a copy of the session that is used exclusively with this archiver instance
def _prepare_session_file(self):
    """
    Creates a copy of the session file for exclusive use with this archiver instance.

    The copy is written under `secrets/` with a name built from today's date and a
    random suffix, and `self.session_file` is re-pointed at that copy (stored
    without the `.session` extension).

    Raises:
        FileNotFoundError: if `{self.session_file}.session` does not exist.
    """
    source_path = f"{self.session_file}.session"
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Session file {source_path} not found.")

    # Build the extension-less name first and append ".session" only for the copy,
    # instead of stripping ".session" out of the full path afterwards.
    session_base = os.path.join("secrets/", f"instabot-{time.strftime('%Y-%m-%d')}{random_str(8)}")
    shutil.copy(source_path, f"{session_base}.session")
    self.session_file = session_base
def _initialize_telegram_client(self):
    """
    Initializes the Telegram client and checks that login works.

    An `OperationalError` raised while constructing the client is logged, not
    propagated. The client is then started once and a success message is logged.
    """
    try:
        self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
    except OperationalError as e:
        # NOTE(review): the error is only logged here; if `self.client` was not set
        # beforehand, the `start()` call below will fail - confirm whether this
        # branch should re-raise instead.
        logger.error(
            f"Unable to access the {self.session_file} session. "
            "Ensure that you don't use the same session file here and in telethon_extractor. "
            # the f-prefix is required on this fragment so that {e} is interpolated
            f"If you do, disable at least one of the archivers for the first-time setup of the telethon session: {e}"
        )

    with self.client.start():
        logger.success(f"SETUP {self.name} login works.")
@@ -63,32 +74,49 @@ class InstagramTbotExtractor(Extractor):
result = Metadata()
tmp_dir = self.tmp_dir
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id
attempts = 0
seen_media = []
message = ""
time.sleep(3)
# media is added before text by the bot so it can be used as a stop-logic mechanism
while attempts < (self.timeout - 3) and (not message or not len(seen_media)):
attempts += 1
time.sleep(1)
for post in self.client.iter_messages(chat, min_id=since_id):
since_id = max(since_id, post.id)
if post.media and post.id not in seen_media:
filename_dest = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
media = self.client.download_media(post.media, filename_dest)
if media:
result.add_media(Media(media))
seen_media.append(post.id)
if post.message: message += post.message
chat, since_id = self._send_url_to_bot(url)
message = self._process_messages(chat, since_id, tmp_dir, result)
if "You must enter a URL to a post" in message:
if "You must enter a URL to a post" in message:
logger.debug(f"invalid link {url=} for {self.name}: {message}")
return False
# # TODO: It currently returns this as a success - is that intentional?
# if "Media not found or unavailable" in message:
# logger.debug(f"invalid link {url=} for {self.name}: {message}")
# return False
if message:
result.set_content(message).set_title(message[:128])
return result.success("insta-via-bot")
def _send_url_to_bot(self, url: str):
    """
    Submits `url` to the 'instagram_load_bot' chat.

    Returns a `(chat, since_id)` tuple: the entity resolved for the bot and the
    id of the message that was just sent to it.
    """
    bot_chat = self.client.get_entity("instagram_load_bot")
    sent_id = self.client.send_message(entity=bot_chat, message=url).id
    return bot_chat, sent_id
def _process_messages(self, chat, since_id, tmp_dir, result):
    """
    Polls `chat` for replies newer than `since_id` and collects their content.

    Media found on a reply is downloaded into `tmp_dir` and attached to `result`;
    reply texts are concatenated. Polling stops once both text and media have been
    seen, or after `self.timeout - 3` one-second attempts.

    Returns the concatenated reply text with surrounding whitespace stripped.
    """
    filler_text = 'The bot receives information through https://hikerapi.com/p/hJqpppqi'
    downloaded_ids = set()
    text_parts = []
    polls = 0
    max_polls = self.timeout - 3

    time.sleep(3)
    # media is added before text by the bot so it can be used as a stop-logic mechanism
    while polls < max_polls and not (text_parts and downloaded_ids):
        polls += 1
        time.sleep(1)
        for post in self.client.iter_messages(chat, min_id=since_id):
            since_id = max(since_id, post.id)
            # Skip known filler message:
            if post.message == filler_text:
                continue
            if post.media and post.id not in downloaded_ids:
                target = os.path.join(tmp_dir, f'{chat.id}_{post.id}')
                saved_path = self.client.download_media(post.media, target)
                if saved_path:
                    result.add_media(Media(saved_path))
                    downloaded_ids.add(post.id)
            if post.message:
                text_parts.append(post.message)
    return "".join(text_parts).strip()

View File

@@ -1 +1 @@
from .telethon_extractor import TelethonArchiver
from .telethon_extractor import TelethonExtractor

View File

@@ -13,7 +13,7 @@ from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
class TelethonArchiver(Extractor):
class TelethonExtractor(Extractor):
valid_url = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")