mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-13 05:48:33 +03:00
Fix bug with Telethon scraper and certain media; add media_archived flag to TwitterScraper
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -11,6 +11,7 @@ docs/source/_*
|
|||||||
*.session
|
*.session
|
||||||
service_account.json
|
service_account.json
|
||||||
.vscode/
|
.vscode/
|
||||||
|
*.log
|
||||||
|
|
||||||
# Unit test / coverage reports
|
# Unit test / coverage reports
|
||||||
reports
|
reports
|
||||||
|
|||||||
1438
Pipfile.lock
generated
1438
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -66,13 +66,10 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
def archive_post_media(self, post : types.Message, client : TelegramClient = None):
|
||||||
logger.debug(f"Archiving post {post}")
|
|
||||||
|
|
||||||
if post.media is None:
|
if post.media is None:
|
||||||
|
logger.debug("No media for post")
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
logger.debug(f"Archiving media {post.media}")
|
|
||||||
|
|
||||||
if client is None:
|
if client is None:
|
||||||
api_id = os.environ['TELEGRAM_API_ID']
|
api_id = os.environ['TELEGRAM_API_ID']
|
||||||
api_hash = os.environ['TELEGRAM_API_HASH']
|
api_hash = os.environ['TELEGRAM_API_HASH']
|
||||||
@@ -81,6 +78,11 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
with TelegramClient(phone, api_id, api_hash) as client:
|
with TelegramClient(phone, api_id, api_hash) as client:
|
||||||
return self.archive_post_media(post, client=client)
|
return self.archive_post_media(post, client=client)
|
||||||
|
|
||||||
|
if type(post.media) == types.MessageMediaDocument:
|
||||||
|
logger.debug(f"Archiving {type(post.media)} with size {post.media.document.size/(1024*1024)} MB")
|
||||||
|
else:
|
||||||
|
logger.debug(f"Archiving {type(post.media)}")
|
||||||
|
|
||||||
key = f'{post.peer_id.channel_id}_{post.id}'
|
key = f'{post.peer_id.channel_id}_{post.id}'
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
@@ -88,6 +90,10 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
|
|
||||||
client.download_media(post.media, output_file)
|
client.download_media(post.media, output_file)
|
||||||
|
|
||||||
|
if len(os.listdir(temp_dir)) == 0:
|
||||||
|
logger.warning(f"No file present. Could not archive {post.media}")
|
||||||
|
return None, None
|
||||||
|
|
||||||
output_file_with_ext = os.listdir(temp_dir)[0]
|
output_file_with_ext = os.listdir(temp_dir)[0]
|
||||||
filename = Path(temp_dir, output_file_with_ext)
|
filename = Path(temp_dir, output_file_with_ext)
|
||||||
|
|
||||||
@@ -96,7 +102,7 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
return (blob, output_file_with_ext)
|
return (blob, output_file_with_ext)
|
||||||
|
|
||||||
def can_handle(self, channel):
|
def can_handle(self, channel):
|
||||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
if channel.platform == "Telegram" and channel.public:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||||
@@ -110,14 +116,13 @@ class TelegramTelethonScraper(Scraper):
|
|||||||
for post in client.iter_messages(username):
|
for post in client.iter_messages(username):
|
||||||
post_url = f'{channel.url}/{post.id}'
|
post_url = f'{channel.url}/{post.id}'
|
||||||
|
|
||||||
logger.info(f"Archiving post {post_url} from {post.date}")
|
logger.trace(f"Archiving post {post_url} from {post.date}")
|
||||||
|
|
||||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||||
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
logger.info(f'Timestamp of post {post} is earlier than the previous archived timestamp {post.date.replace(tzinfo=timezone.utc)}')
|
||||||
break
|
break
|
||||||
|
|
||||||
archived_urls = {}
|
archived_urls = {}
|
||||||
logger.info(f"Archiving post {post_url}")
|
|
||||||
|
|
||||||
if post.media is not None:
|
if post.media is not None:
|
||||||
archived_urls[post_url] = None
|
archived_urls[post_url] = None
|
||||||
|
|||||||
74
test.py
74
test.py
@@ -1,74 +0,0 @@
|
|||||||
from sqlalchemy import create_engine
|
|
||||||
from loguru import logger
|
|
||||||
import gspread
|
|
||||||
from sqlalchemy import create_engine
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
|
|
||||||
from cisticola.base import Channel, Post, ScraperResult, mapper_registry
|
|
||||||
from cisticola.scraper import (
|
|
||||||
ScraperController,
|
|
||||||
BitchuteScraper,
|
|
||||||
GabScraper,
|
|
||||||
GettrScraper,
|
|
||||||
OdyseeScraper,
|
|
||||||
RumbleScraper,
|
|
||||||
TelegramSnscrapeScraper,
|
|
||||||
TelegramTelethonScraper,
|
|
||||||
TwitterScraper)
|
|
||||||
from cisticola.transformer import ETLController
|
|
||||||
from cisticola.transformer.twitter import TwitterTransformer
|
|
||||||
|
|
||||||
logger.add("../test.log")
|
|
||||||
|
|
||||||
controller = ScraperController()
|
|
||||||
|
|
||||||
scrapers = [
|
|
||||||
BitchuteScraper(),
|
|
||||||
GabScraper(),
|
|
||||||
GettrScraper(),
|
|
||||||
OdyseeScraper(),
|
|
||||||
RumbleScraper(),
|
|
||||||
TelegramTelethonScraper(),
|
|
||||||
TwitterScraper()]
|
|
||||||
|
|
||||||
controller.register_scrapers(scrapers)
|
|
||||||
|
|
||||||
engine = create_engine('sqlite:///test.db')
|
|
||||||
mapper_registry.metadata.create_all(bind=engine)
|
|
||||||
session_generator = sessionmaker()
|
|
||||||
session_generator.configure(bind=engine)
|
|
||||||
session = session_generator()
|
|
||||||
|
|
||||||
gc = gspread.service_account(filename='service_account.json')
|
|
||||||
|
|
||||||
# Open a sheet from a spreadsheet in one go
|
|
||||||
wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1k5VgqREoA3v1r7bkVq7TOTRDtdYqTMWkQnsZpRbntpw/edit#gid=0")
|
|
||||||
channels = wks.worksheet("channels").get_all_records()
|
|
||||||
|
|
||||||
for c in channels:
|
|
||||||
del c['followers']
|
|
||||||
|
|
||||||
for k in c.keys():
|
|
||||||
if c[k] == 'TRUE' or c[k] == 'yes': c[k] = True
|
|
||||||
if c[k] == 'FALSE' or c[k] == 'no': c[k] = False
|
|
||||||
|
|
||||||
# check to see if this already exists,
|
|
||||||
channel = session.query(Channel).filter_by(platform_id=c['platform_id'], platform=c['platform']).first()
|
|
||||||
|
|
||||||
if not channel:
|
|
||||||
channel = Channel(**c, source='researcher')
|
|
||||||
session.add(channel)
|
|
||||||
|
|
||||||
session.commit()
|
|
||||||
|
|
||||||
controller.connect_to_db(engine)
|
|
||||||
controller.scrape_all_channels(archive_media = False)
|
|
||||||
|
|
||||||
controller.archive_unarchived_media()
|
|
||||||
|
|
||||||
# transformer = TwitterTransformer()
|
|
||||||
|
|
||||||
# etl_controller = ETLController()
|
|
||||||
# etl_controller.register_transformer(transformer)
|
|
||||||
# etl_controller.connect_to_db(engine)
|
|
||||||
# etl_controller.transform_all_untransformed()
|
|
||||||
Reference in New Issue
Block a user