telethon join channels working

This commit is contained in:
msramalho
2022-12-14 14:01:39 +00:00
parent 955891a411
commit b3860cfec1
17 changed files with 539 additions and 295 deletions

View File

@@ -1,8 +1,11 @@
from __future__ import annotations
from ast import List
from typing import Union, Dict
from dataclasses import dataclass
from archivers.archiver import Archiverv2
from enrichers.enricher import Enricher
from metadata import Metadata
"""
how not to couple the different pieces of logic
@@ -108,12 +111,13 @@ Once an archiver returns a link to a local file (for eg to a storage), how do we
The context metadata should include a temporary folder (maybe a LocalStorage instance?)
"""
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
# identify each formatter, storage, database, etc
# self.feeder = Feeder.init(config.feeder, config.get(config.feeder))
# Is it possible to overwrite config.yaml values? it could be useful: share config file and modify gsheets_feeder.sheet via CLI
# where does that update/processing happen? in config.py
# reflection for Archiver to know wihch child classes it has? use Archiver.__subclasses__
@@ -123,7 +127,13 @@ class ArchivingOrchestrator:
# ]
self.feeder = config.feeder
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
for a in self.archivers: a.setup()
self.formatters = []
self.storages = []
self.databases = []
# self.formatters = [
# Formatter.init(f, config)
# for f in config.formatters
@@ -145,30 +155,33 @@ class ArchivingOrchestrator:
def feed(self) -> list(ArchiveResult):
for url in self.feeder:
print("ARCHIVING", url)
# self.archive(url)
self.archive(url)
# how does this handle the parameters like folder which can be different for each archiver?
# the storage needs to know where to archive!!
# solution: feeders have context: extra metadata that they can read or ignore,
# solution: feeders have context: extra metadata that they can read or ignore,
# all of it should have sensible defaults (eg: folder)
# default feeder is a list with 1 element
def archive(self, url) -> Union[ArchiveResult, None]:
url = clear_url(url)
result = Metadata(url=url)
# TODO:
# url = clear_url(url)
# result = Metadata(url=url)
result = Metadata()
result.set("url", url)
should_archive = True
for d in databases: should_archive &= d.should_process(url)
for d in self.databases: should_archive &= d.should_process(url)
# should storages also be able to check?
for s in storages: should_archive &= s.should_process(url)
for s in self.storages: should_archive &= s.should_process(url)
if not should_archive:
print("skipping")
return "skipping"
# signal to DB that archiving has started
for d in databases:
for d in self.databases:
# are the databases to decide whether to archive?
# they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
# they can simply return True by default, otherwise they can avoid duplicates. should this logic be more granular, for example on the archiver level: a tweet will not need be scraped twice, whereas an instagram profile might. the archiver could not decide from the link which parts to archive,
# instagram profile example: it would always re-archive everything
# maybe the database/storage could use a hash/key to decide if there's a need to re-archive
if d.should_process(url):
@@ -180,15 +193,15 @@ class ArchivingOrchestrator:
return
# vk, telethon, ...
for a in archivers:
for a in self.archivers:
# with automatic try/catch in download + archived (+ the other ops below)
# should the archivers come with the config already? are there configs which change at runtime?
# should the archivers come with the config already? are there configs which change at runtime?
# think not, so no need to pass config as parameter
# do they need to be refreshed with every execution?
# do they need to be refreshed with every execution?
# this is where the Hashes come from, the place with access to all content
# the archiver does not have access to storage
result.update(a.download(url))
if result.is_success(): break
result.merge(a.download(result))
if True or result.is_success(): break
# what if an archiver returns multiple entries and one is to be part of HTMLgenerator?
# should it call the HTMLgenerator as if it's not an enrichment?
@@ -196,20 +209,20 @@ class ArchivingOrchestrator:
# then how to execute it last? should there also be post-processors? are there other examples?
# maybe as a PDF? or a Markdown file
# side captures: screenshot, wacz, webarchive, thumbnails, HTMLgenerator
for e in enrichers:
result.update(e.enrich(result))
for e in self.enrichers:
result.merge(e.enrich(result))
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
for p in formatter:
result.update(p.process(result))
for f in self.formatters:
result.merge(f.format(result))
# storages
for s in storages:
for s in self.storages:
for m in result.media:
m.update(s.store(m))
m.merge(s.store(m))
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving
for d in databases: d.done( result)
for d in self.databases: d.done(result)
return result
return result