mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
yield data rather than returning a list
This commit is contained in:
@@ -4,6 +4,7 @@ import cisticola.scraper.base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from loguru import logger
|
||||
|
||||
MAX_POSTS = 10
|
||||
|
||||
class ScraperController:
|
||||
"""Registers scrapers, uses them to generate ScraperResults. Synchronizes
|
||||
@@ -27,6 +28,10 @@ class ScraperController:
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.can_handle(channel):
|
||||
session = self.session()
|
||||
handled = True
|
||||
added = 0
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
rows = session.query(cisticola.base.ScraperResult).order_by(
|
||||
@@ -38,20 +43,20 @@ class ScraperController:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since)
|
||||
handled = True
|
||||
|
||||
for post in posts:
|
||||
session.add(post)
|
||||
added += 1
|
||||
if added >= MAX_POSTS:
|
||||
break
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {len(posts)} new posts from {channel}")
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
break
|
||||
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
else:
|
||||
session = self.session()
|
||||
session.bulk_save_objects(posts)
|
||||
session.commit()
|
||||
|
||||
logger.info(f"Added {len(posts)} entries to database")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
# create tables
|
||||
|
||||
@@ -13,16 +13,12 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
return True
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None):
|
||||
posts = []
|
||||
scr = snscrape.modules.telegram.TelegramChannelScraper(
|
||||
channel.screenname)
|
||||
|
||||
g = scr.get_items()
|
||||
|
||||
for post in g:
|
||||
if (len(posts)) >= 10:
|
||||
break
|
||||
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
@@ -36,7 +32,7 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
video_archive_url = self.archive_media(post.video)
|
||||
archived_urls[post.video] = video_archive_url
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Telegram",
|
||||
channel=channel.id,
|
||||
@@ -45,6 +41,4 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
archived_urls=archived_urls
|
||||
))
|
||||
|
||||
return posts
|
||||
)
|
||||
|
||||
@@ -20,22 +20,19 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
|
||||
posts = []
|
||||
scraper = snscrape.modules.twitter.TwitterProfileScraper(
|
||||
TwitterScraper.get_username_from_url(channel.url))
|
||||
|
||||
first = True
|
||||
|
||||
for tweet in scraper.get_items():
|
||||
if len(posts) >= 10:
|
||||
break
|
||||
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
else:
|
||||
print('too far')
|
||||
break
|
||||
|
||||
archived_urls = {}
|
||||
@@ -58,7 +55,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
archived_url = self.archive_media(url)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
channel=channel.id,
|
||||
@@ -66,9 +63,7 @@ class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(),
|
||||
raw_data=tweet.json(),
|
||||
archived_urls=archived_urls))
|
||||
|
||||
return posts
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None:
|
||||
|
||||
Reference in New Issue
Block a user