mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Change media_archived to be a timestamp for all scrapers
This commit is contained in:
11
app.py
11
app.py
@@ -11,7 +11,10 @@ from cisticola.base import Channel, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
ScraperController,
|
||||
VkontakteScraper,
|
||||
TelegramTelethonScraper)
|
||||
TelegramTelethonScraper,
|
||||
GettrScraper,
|
||||
OdyseeScraper,
|
||||
RumbleScraper)
|
||||
|
||||
def sync_channels(args):
|
||||
logger.info("Synchronizing channels")
|
||||
@@ -78,7 +81,11 @@ def get_scraper_controller():
|
||||
|
||||
scrapers = [
|
||||
TelegramTelethonScraper(),
|
||||
VkontakteScraper()]
|
||||
VkontakteScraper(),
|
||||
GettrScraper(),
|
||||
OdyseeScraper(),
|
||||
RumbleScraper()
|
||||
]
|
||||
|
||||
controller.register_scrapers(scrapers)
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ class ScraperResult:
|
||||
date: datetime
|
||||
|
||||
#: JSON dump of dict that contains all data scraped for the post.
|
||||
raw_posts: str
|
||||
raw_data: str
|
||||
|
||||
#: Datetime (relative to UTC) that the scraped post was archived at.
|
||||
date_archived: datetime
|
||||
@@ -249,7 +249,7 @@ raw_posts_table = Table('raw_posts', mapper_registry.metadata,
|
||||
Column('channel', Integer, ForeignKey('channels.id')),
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_posts', String),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON),
|
||||
Column('media_archived', DateTime))
|
||||
|
||||
@@ -257,7 +257,7 @@ class Scraper:
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
@@ -402,6 +402,9 @@ class ScraperController:
|
||||
session.commit()
|
||||
added += 1
|
||||
|
||||
if added >= 200:
|
||||
break
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
|
||||
@@ -62,9 +62,9 @@ class BitchuteScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromtimestamp(post['timestamp']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Bitchute" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -81,9 +81,9 @@ class GabScraper(Scraper):
|
||||
platform_id=post['id'],
|
||||
date=datetime.fromisoformat(post['created_at'].replace("Z", "+00:00")).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def can_handle(self, channel: Channel) -> bool:
|
||||
if channel.platform == "Gab" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -59,9 +59,9 @@ class GettrScraper(Scraper):
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post),
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -66,9 +66,9 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=post.date_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post._asdict(), default=str),
|
||||
raw_data=json.dumps(post._asdict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
for comment in post.get_comments():
|
||||
|
||||
@@ -83,9 +83,9 @@ class InstagramScraper(Scraper):
|
||||
platform_id=post.mediaid,
|
||||
date=comment.created_at_utc,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(comment_dict, default=str),
|
||||
raw_data=json.dumps(comment_dict, default=str),
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
media_archived=datetime.now(timezone.utc))
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Instagram" and self.get_username_from_url(channel.url) is not None:
|
||||
|
||||
@@ -62,9 +62,9 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=video.info['claim_id'],
|
||||
date=datetime.fromtimestamp(video.info['created']),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(video.info),
|
||||
raw_data=json.dumps(video.info),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
for comment in all_comments:
|
||||
|
||||
@@ -75,9 +75,9 @@ class OdyseeScraper(Scraper):
|
||||
platform_id=comment.info['claim_id'],
|
||||
date=datetime.fromtimestamp(comment.info['created']),
|
||||
date_archived=datetime.now(),
|
||||
raw_posts=json.dumps(comment.info),
|
||||
raw_data=json.dumps(comment.info),
|
||||
archived_urls={},
|
||||
media_archived=True)
|
||||
media_archived=datetime.now(timezone.utc))
|
||||
|
||||
def archive_files(self, result: ScraperResult) -> ScraperResult:
|
||||
for url in result.archived_urls:
|
||||
@@ -91,7 +91,7 @@ class OdyseeScraper(Scraper):
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
|
||||
@@ -41,9 +41,9 @@ class RumbleScraper(Scraper):
|
||||
platform_id=post['media_url'].split('/')[-2],
|
||||
date=post['datetime'].replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post, default = str),
|
||||
raw_data=json.dumps(post, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
|
||||
ext = '.' + content_type.split('/')[-1]
|
||||
@@ -57,7 +57,7 @@ class RumbleScraper(Scraper):
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
def can_handle(self, channel):
|
||||
|
||||
@@ -50,9 +50,9 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
platform_id=post.url,
|
||||
date=post.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=post.json(),
|
||||
raw_data=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None
|
||||
)
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
@@ -44,7 +44,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
key = list(result.archived_urls.keys())[0]
|
||||
|
||||
if result.archived_urls[key] is None:
|
||||
raw = json.loads(result.raw_posts)
|
||||
raw = json.loads(result.raw_data)
|
||||
|
||||
message = client.get_messages(raw['peer_id']['channel_id'], ids=[raw['id']])
|
||||
|
||||
@@ -144,7 +144,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
platform_id=post_url,
|
||||
date=post.date.replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(post.to_dict(), default=str),
|
||||
raw_data=json.dumps(post.to_dict(), default=str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
|
||||
@@ -72,9 +72,9 @@ class TwitterScraper(Scraper):
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=tweet.json(),
|
||||
raw_data=tweet.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Twitter" and (channel.platform_id or channel.screenname):
|
||||
|
||||
@@ -69,7 +69,7 @@ class VkontakteScraper(Scraper):
|
||||
platform_id=post.url.split('/')[-1],
|
||||
date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=post.json(),
|
||||
raw_data=post.json(),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
|
||||
@@ -75,9 +75,9 @@ class YoutubeScraper(Scraper):
|
||||
platform_id=video_id,
|
||||
date=datetime.strptime(video['upload_date'], '%Y%m%d').replace(tzinfo=timezone.utc),
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_posts=json.dumps(video, default = str),
|
||||
raw_data=json.dumps(video, default = str),
|
||||
archived_urls=archived_urls,
|
||||
media_archived=archive_media)
|
||||
media_archived=datetime.now(timezone.utc) if archive_media else None)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Youtube" and channel.url:
|
||||
@@ -115,7 +115,7 @@ class YoutubeScraper(Scraper):
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
result.archived_urls[url] = archived_url
|
||||
|
||||
result.media_archived = True
|
||||
result.media_archived = datetime.now(timezone.utc)
|
||||
return result
|
||||
|
||||
def get_profile(self, channel: Channel) -> RawChannelInfo:
|
||||
|
||||
@@ -20,7 +20,7 @@ class BitchuteTransformer(Transformer):
|
||||
return False
|
||||
|
||||
def transform_media(self, data: ScraperResult, transformed: Post) -> Generator[Media, None, None]:
|
||||
raw = json.loads(data.raw_posts)
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
orig = raw['video_url']
|
||||
new = data.archived_urls[orig]
|
||||
@@ -30,7 +30,7 @@ class BitchuteTransformer(Transformer):
|
||||
yield m
|
||||
|
||||
def transform(self, data: ScraperResult) -> Post:
|
||||
raw = json.loads(data.raw_posts)
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
soup = BeautifulSoup(raw['body'], features = 'html.parser')
|
||||
content = soup.find_all('p')[-1].text
|
||||
|
||||
@@ -47,7 +47,7 @@ class TwitterTransformer(Transformer):
|
||||
|
||||
|
||||
def transform(self, data: ScraperResult, insert: Callable) -> Generator[Union[Post, Channel, Media], None, None]:
|
||||
raw = json.loads(data.raw_posts)
|
||||
raw = json.loads(data.raw_data)
|
||||
|
||||
transformed = Post(
|
||||
raw_id=data.id,
|
||||
|
||||
Reference in New Issue
Block a user