mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-12 13:28:34 +03:00
Archive media in dictionary
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@@ -17,6 +17,7 @@ class ScraperResult:
|
||||
date: datetime
|
||||
raw_data: str
|
||||
date_archived: datetime
|
||||
archived_urls: dict
|
||||
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
@@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime))
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ class Scraper:
|
||||
def __str__(self):
|
||||
return self.__version__
|
||||
|
||||
def archive_media(self, url: str) -> str:
|
||||
def archive_media(self, url: str, key: str = None) -> str:
|
||||
n_retries = 0
|
||||
r = requests.get(url)
|
||||
|
||||
@@ -39,8 +39,9 @@ class Scraper:
|
||||
|
||||
blob = r.content
|
||||
|
||||
key = url.split('/')[-1]
|
||||
key = key.split('?')[0]
|
||||
if key is None:
|
||||
key = url.split('/')[-1]
|
||||
key = key.split('?')[0]
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from typing import List
|
||||
@@ -24,15 +23,15 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date.replace(tzinfo=timezone.utc):
|
||||
break
|
||||
|
||||
raw_data = post.json()
|
||||
archived_urls = {}
|
||||
|
||||
for image_url in post.images:
|
||||
archive_url = self.archive_media(image_url)
|
||||
raw_data = raw_data.replace(image_url, archive_url)
|
||||
archived_urls[image_url] = archive_url
|
||||
|
||||
if post.video:
|
||||
video_archive_url = self.archive_media(post.video)
|
||||
raw_data = raw_data.replace(post.video, video_archive_url)
|
||||
archived_urls[post.video] = video_archive_url
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
@@ -40,8 +39,9 @@ class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
|
||||
channel=channel.id,
|
||||
platform_id=post.url,
|
||||
date=post.date,
|
||||
date_archived=datetime.now(),
|
||||
raw_data=raw_data
|
||||
date_archived=datetime.now(timezone.utc),
|
||||
raw_data=post.json(),
|
||||
archived_urls=archived_urls
|
||||
))
|
||||
|
||||
return posts
|
||||
|
||||
6
test.py
6
test.py
@@ -1,7 +1,3 @@
|
||||
# TODO/TODECIDE:
|
||||
# should 'username' be a part of the Channel definition somehow?
|
||||
# still need to do some planning for handling media
|
||||
|
||||
import cisticola
|
||||
import cisticola.scraper.telegram_snscrape
|
||||
|
||||
@@ -35,7 +31,7 @@ controller = cisticola.ScraperController()
|
||||
scraper = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
|
||||
controller.register_scraper(scraper)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
engine = create_engine('sqlite:///test4.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
|
||||
Reference in New Issue
Block a user