mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
Merge pull request #2 from bellingcat/media
WIP: Archiving media, organization improvements
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -4,4 +4,4 @@
|
||||
*.db
|
||||
docs/build/
|
||||
docs/source/_*
|
||||
|
||||
.env
|
||||
|
||||
3
Pipfile
3
Pipfile
@@ -5,13 +5,14 @@ name = "pypi"
|
||||
|
||||
[packages]
|
||||
sqlalchemy = "*"
|
||||
snscrape = "*"
|
||||
loguru = "*"
|
||||
gogettr = "*"
|
||||
requests = "*"
|
||||
bs4 = "*"
|
||||
dateparser = "*"
|
||||
sphinx = "*"
|
||||
boto3 = "*"
|
||||
snscrape = {git = "https://github.com/bellingcat/snscrape.git"}
|
||||
|
||||
[dev-packages]
|
||||
|
||||
|
||||
42
Pipfile.lock
generated
42
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "cde7247f41da5501b9fc4fc5d01916548f719b3d4ea0f1dd1765c4cf0413bbf7"
|
||||
"sha256": "d3ee112521273c2b0b9df074b4eb9a20649a2854bfffa433171749019acf8561"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -39,6 +39,22 @@
|
||||
"markers": "python_version >= '3.1'",
|
||||
"version": "==4.10.0"
|
||||
},
|
||||
"boto3": {
|
||||
"hashes": [
|
||||
"sha256:0e8d4d814f94031947035a4c2bb2c23832d5de941a6a492fb85794a02bafc44d",
|
||||
"sha256:95d9b5b6fe3383fbf8f33d58f62258d3b3ea138d4369017031339b60fd5b8887"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.21.6"
|
||||
},
|
||||
"botocore": {
|
||||
"hashes": [
|
||||
"sha256:359b9ea3870a1f8264113cb0b1216baa94bf1e8cee5d5d8af63a2e7ca6e7b33c",
|
||||
"sha256:69aaa5a78ac7371f573e463be51fb962213c42fab08ef82380e84b9a87386949"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.24.6"
|
||||
},
|
||||
"bs4": {
|
||||
"hashes": [
|
||||
"sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"
|
||||
@@ -194,6 +210,14 @@
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.0.3"
|
||||
},
|
||||
"jmespath": {
|
||||
"hashes": [
|
||||
"sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9",
|
||||
"sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.0"
|
||||
},
|
||||
"loguru": {
|
||||
"hashes": [
|
||||
"sha256:066bd06758d0a513e9836fd9c6b5a75bfb3fd36841f4b996bc60b547a309d41c",
|
||||
@@ -460,6 +484,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.27.1"
|
||||
},
|
||||
"s3transfer": {
|
||||
"hashes": [
|
||||
"sha256:25c140f5c66aa79e1ac60be50dcd45ddc59e83895f062a3aab263b870102911f",
|
||||
"sha256:69d264d3e760e569b78aaa0f22c97e955891cd22e32b10c51f784eeda4d9d10a"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.5.1"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
@@ -476,12 +508,8 @@
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"snscrape": {
|
||||
"hashes": [
|
||||
"sha256:af30d12872da692ff9ccaf5651962edceb1fd4a28cf7cc92c8c898902f009ce3",
|
||||
"sha256:fd176765196ca17979be7f54e041f430e4cb23a5e651fa29cf3dc382258019f2"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.4.3.20220106"
|
||||
"git": "https://github.com/bellingcat/snscrape.git",
|
||||
"ref": "de4ebed81f3f6a4bb4c65630daab6ec63784959b"
|
||||
},
|
||||
"soupsieve": {
|
||||
"hashes": [
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import List
|
||||
import cisticola.scraper
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from loguru import logger
|
||||
|
||||
@@ -14,7 +14,7 @@ class ScraperController:
|
||||
self.session = None
|
||||
self.mapper_registry = None
|
||||
|
||||
def register_scraper(self, scraper: cisticola.scraper.Scraper):
|
||||
def register_scraper(self, scraper: cisticola.scraper.base.Scraper):
|
||||
self.scrapers.append(scraper)
|
||||
|
||||
def scrape_channels(self, channels: List[cisticola.base.Channel]):
|
||||
@@ -27,10 +27,15 @@ class ScraperController:
|
||||
|
||||
for scraper in self.scrapers:
|
||||
if scraper.can_handle(channel):
|
||||
session = self.session()
|
||||
handled = True
|
||||
added = 0
|
||||
|
||||
# get most recent post
|
||||
session = self.session()
|
||||
rows = session.query(cisticola.base.ScraperResult).order_by(
|
||||
cisticola.base.ScraperResult.date_archived).limit(1).all()
|
||||
rows = session.query(cisticola.base.ScraperResult).where(
|
||||
cisticola.base.ScraperResult.channel == channel.id).order_by(
|
||||
cisticola.base.ScraperResult.date.desc()).limit(1).all()
|
||||
|
||||
if len(rows) == 1:
|
||||
since = rows[0]
|
||||
@@ -38,21 +43,19 @@ class ScraperController:
|
||||
since = None
|
||||
|
||||
posts = scraper.get_posts(channel, since=since)
|
||||
handled = True
|
||||
|
||||
for post in posts:
|
||||
session.add(post)
|
||||
added += 1
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {len(posts)} new posts from {channel}")
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
break
|
||||
|
||||
if not handled:
|
||||
logger.warning(f"No handler found for Channel {channel}")
|
||||
|
||||
session = self.session()
|
||||
session.bulk_save_objects(posts)
|
||||
session.commit()
|
||||
|
||||
logger.info(f"Added {len(posts)} entries to database")
|
||||
|
||||
def connect_to_db(self, engine):
|
||||
# create tables
|
||||
cisticola.base.mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from sqlalchemy.orm import registry
|
||||
from sqlalchemy import Table, Column, Integer, String, DateTime, ForeignKey
|
||||
from sqlalchemy import Table, Column, Integer, String, JSON, DateTime, ForeignKey
|
||||
|
||||
mapper_registry = registry()
|
||||
|
||||
@@ -12,11 +12,12 @@ class ScraperResult:
|
||||
|
||||
scraper: str
|
||||
platform: str
|
||||
channel: int
|
||||
channel: int #TODO there is probably a way of making this a Channel object foreign key
|
||||
platform_id: str
|
||||
date: datetime
|
||||
raw_data: str
|
||||
date_archived: datetime
|
||||
archived_urls: dict
|
||||
|
||||
|
||||
raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
@@ -28,7 +29,8 @@ raw_data_table = Table('raw_data', mapper_registry.metadata,
|
||||
Column('platform_id', String),
|
||||
Column('date', DateTime),
|
||||
Column('raw_data', String),
|
||||
Column('date_archived', DateTime))
|
||||
Column('date_archived', DateTime),
|
||||
Column('archived_urls', JSON))
|
||||
|
||||
mapper_registry.map_imperatively(ScraperResult, raw_data_table)
|
||||
|
||||
@@ -42,6 +44,7 @@ class Channel:
|
||||
followers: int
|
||||
platform: str
|
||||
url: str
|
||||
screenname: str
|
||||
country: str
|
||||
influencer: str
|
||||
public: bool
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
from typing import List
|
||||
import cisticola.base
|
||||
|
||||
|
||||
class Scraper:
    """Common interface implemented by every platform-specific scraper.

    Subclasses are expected to set ``__version__`` and to override
    ``can_handle`` and ``get_posts``; the versions here are placeholders.
    """

    __version__ = "Scraper 0.0.0"

    def __init__(self):
        """Nothing to initialise in the base class."""

    def __str__(self):
        """Identify the scraper by its version string."""
        return self.__version__

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Placeholder: subclasses report whether they can scrape ``channel``.

        The base implementation has no body and therefore returns ``None``.
        """

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
        """Placeholder: subclasses return the posts found for ``channel``.

        ``since`` is an optional previously stored result that subclasses
        use as a cut-off. The base implementation returns ``None``.
        """
|
||||
|
||||
61
cisticola/scraper/base.py
Normal file
61
cisticola/scraper/base.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from typing import Generator
|
||||
import cisticola.base
|
||||
import requests
|
||||
import os
|
||||
import boto3
|
||||
from io import BytesIO
|
||||
from loguru import logger
|
||||
|
||||
class Scraper:
    """Base class for platform scrapers, including shared media archiving.

    Subclasses set ``__version__`` and override ``can_handle`` and
    ``get_posts``. Creating an instance builds a boto3 S3 client from the
    ``DO_SPACES_REGION``, ``DO_SPACES_KEY`` and ``DO_SPACES_SECRET``
    environment variables; ``archive_media`` additionally reads
    ``DO_BUCKET`` and ``DO_URL``.
    """

    __version__ = "Scraper 0.0.0"

    # Seconds to wait on connect / between received bytes before a media
    # request counts as failed. Without a timeout a stalled server would
    # hang the whole scrape.
    request_timeout = 60

    # Number of additional attempts made after the first failed request.
    max_retries = 5

    def __init__(self):
        region = os.getenv('DO_SPACES_REGION')
        self.s3_client = boto3.client(
            's3',
            region_name=region,
            endpoint_url='https://{}.digitaloceanspaces.com'.format(region),
            aws_access_key_id=os.getenv('DO_SPACES_KEY'),
            aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))

    def __str__(self):
        return self.__version__

    def _fetch(self, url: str):
        """Issue one GET for ``url``; return the response, or None on a network error."""
        try:
            return requests.get(url, timeout=self.request_timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts are treated like a non-200
            # answer so that they go through the same retry path.
            logger.warning(f"Request for {url} raised {exc!r}")
            return None

    def archive_media(self, url: str, key: str = None) -> str:
        """Download ``url`` and upload its content to the configured bucket.

        Parameters
        ----------
        url:
            Location of the media file to archive.
        key:
            Object name to store the file under. When omitted, the last
            path segment of ``url`` (without query string or fragment) is
            used. A caller-supplied key is used verbatim.

        Returns
        -------
        str
            The URL of the archived copy (``DO_URL`` + ``/`` + object key),
            or the original ``url`` unchanged when the download failed
            after all retries.
        """
        response = self._fetch(url)
        n_retries = 0

        while (response is None or response.status_code != 200) and n_retries < self.max_retries:
            logger.warning(f"{n_retries}/{self.max_retries}: Request for {url} failed")
            n_retries += 1
            response = self._fetch(url)

        if response is None or response.status_code != 200:
            logger.error(f"Could not fetch URL {url}")
            return url

        blob = response.content
        content_type = response.headers.get('Content-Type')

        if key is None:
            # Strip fragment and query *before* taking the last path
            # segment, so a '/' inside the query string cannot be mistaken
            # for a path separator.
            path = url.split('#', 1)[0].split('?', 1)[0]
            key = path.rstrip('/').split('/')[-1]

        # Objects are grouped per scraper version, e.g. "Scraper_0.0.0/<key>".
        filename = self.__version__.replace(' ', '_') + '/' + key

        extra_args = {'ACL': 'public-read'}
        if content_type:
            # Only forward the header when the server sent one; passing
            # None as ContentType is rejected by boto3.
            extra_args['ContentType'] = content_type

        self.s3_client.upload_fileobj(
            BytesIO(blob),
            Bucket=os.getenv('DO_BUCKET'),
            Key=filename,
            ExtraArgs=extra_args)

        archived_url = os.getenv('DO_URL') + '/' + filename

        return archived_url

    def can_handle(self, channel: cisticola.base.Channel) -> bool:
        """Placeholder: subclasses report whether they can scrape ``channel``.

        The base implementation has no body and therefore returns ``None``.
        """
        pass

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Placeholder: subclasses yield the posts found for ``channel``.

        ``since`` is an optional previously stored result that subclasses
        use as a cut-off. The base implementation returns ``None``.
        """
        pass
|
||||
@@ -4,7 +4,7 @@ import re
|
||||
from html.parser import HTMLParser
|
||||
import dateparser
|
||||
import json
|
||||
from typing import List
|
||||
from typing import Generator
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -23,7 +23,7 @@ class BitchuteScraper(cisticola.scraper.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import List
|
||||
from typing import Generator
|
||||
from gogettr import PublicClient
|
||||
|
||||
class GettrScraper(cisticola.scraper.Scraper):
|
||||
class GettrScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Gettr, using gogettr library"""
|
||||
__version__ = "GettrScraper 0.0.1"
|
||||
|
||||
@@ -15,26 +16,41 @@ class GettrScraper(cisticola.scraper.Scraper):
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
|
||||
posts = []
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = GettrScraper.get_username_from_url(channel.url)
|
||||
scraper = client.user_activity(username=username, type="posts")
|
||||
|
||||
for post in scraper:
|
||||
if since is not None and post['cdate'] <= int(since.date_archived.timestamp()):
|
||||
if since is not None and datetime.fromtimestamp(post['cdate']*0.001) <= since.date:
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
archived_urls = {}
|
||||
|
||||
if 'imgs' in post:
|
||||
for img in post['imgs']:
|
||||
url = "https://media.gettr.com/" + img
|
||||
archived_url = self.archive_media(url)
|
||||
archived_urls[img] = archived_url
|
||||
|
||||
if 'main' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['main'])
|
||||
archived_urls[post['main']] = archived_url
|
||||
|
||||
# TODO this is just archiving the playlist file, not the actual video
|
||||
if 'vid' in post:
|
||||
archived_url = self.archive_media("https://media.gettr.com/" + post['vid'])
|
||||
archived_urls[post['vid']] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Gettr",
|
||||
channel=username,
|
||||
channel=channel.id,
|
||||
platform_id=post['_id'],
|
||||
date=datetime.fromtimestamp(post['cdate']/1000.),
|
||||
date_archived=datetime.now(),
|
||||
raw_data=json.dumps(post)))
|
||||
|
||||
return posts
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Gettr" and GettrScraper.get_username_from_url(channel.url) is not None:
|
||||
|
||||
44
cisticola/scraper/telegram_snscrape.py
Normal file
44
cisticola/scraper/telegram_snscrape.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import cisticola.base
|
||||
import cisticola.scraper.base
|
||||
from typing import Generator
|
||||
import snscrape.modules
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
class TelegramSnscrapeScraper(cisticola.scraper.base.Scraper):
    """Scraper for Telegram channels, built on snscrape's TelegramChannelScraper."""

    __version__ = "TelegramSnscrapeScraper 0.0.1"

    def can_handle(self, channel):
        """Return True for a public, non-chat Telegram channel, otherwise None."""
        is_match = channel.platform == "Telegram" and channel.public and not channel.chat
        return True if is_match else None

    def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
        """Yield a ScraperResult for each post of ``channel``, archiving its media.

        Items are consumed in the order snscrape returns them. Iteration
        stops at the first item whose date is not later than ``since.date``
        (both forced to UTC before comparing). When ``since`` is None every
        item is yielded.
        """
        telegram_scraper = snscrape.modules.telegram.TelegramChannelScraper(
            channel.screenname)

        for item in telegram_scraper.get_items():
            if since is not None:
                item_time = item.date.replace(tzinfo=timezone.utc)
                cutoff = since.date.replace(tzinfo=timezone.utc)
                if item_time <= cutoff:
                    break

            # Map every original media URL to the URL of its archived copy.
            archived_urls = {
                image_url: self.archive_media(image_url)
                for image_url in item.images
            }

            if item.video:
                archived_urls[item.video] = self.archive_media(item.video)

            yield cisticola.base.ScraperResult(
                scraper=self.__version__,
                platform="Telegram",
                channel=channel.id,
                platform_id=item.url,
                date=item.date,
                date_archived=datetime.now(timezone.utc),
                raw_data=item.json(),
                archived_urls=archived_urls,
            )
|
||||
@@ -1,42 +1,59 @@
|
||||
import cisticola.base
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
import cisticola.scraper.base
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
import snscrape.modules
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class TwitterScraper(cisticola.scraper.Scraper):
|
||||
class TwitterScraper(cisticola.scraper.base.Scraper):
|
||||
"""An implementation of a Scraper for Twitter, using snscrape library"""
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
# TODO snscrape should be able to scrape from user ID alone, but there is
|
||||
# currently a bug/other issue, so it is extracting the username from URL
|
||||
def get_username_from_url(url):
|
||||
username = url.split("twitter.com/")[1]
|
||||
if len(username.split("/")) > 1:
|
||||
return None
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> Generator[cisticola.base.ScraperResult, None, None]:
|
||||
scraper = snscrape.modules.twitter.TwitterProfileScraper(channel.platform_id)
|
||||
|
||||
return username
|
||||
|
||||
def get_posts(self, channel: cisticola.base.Channel, since: cisticola.base.ScraperResult = None) -> List[cisticola.base.ScraperResult]:
|
||||
posts = []
|
||||
scraper = snscrape.modules.twitter.TwitterUserScraper(
|
||||
TwitterScraper.get_username_from_url(channel.url))
|
||||
first = True
|
||||
|
||||
for tweet in scraper.get_items():
|
||||
if since is not None and tweet.date.timestamp() <= since.date_archived.timestamp():
|
||||
break
|
||||
if since is not None and tweet.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc):
|
||||
# with TwitterProfileScraper, the first tweet could be an old pinned tweet
|
||||
if first:
|
||||
first = False
|
||||
continue
|
||||
else:
|
||||
break
|
||||
|
||||
posts.append(cisticola.base.ScraperResult(
|
||||
archived_urls = {}
|
||||
|
||||
if tweet.media:
|
||||
for media in tweet.media:
|
||||
if type(media) == snscrape.modules.twitter.Video:
|
||||
variant = max(
|
||||
[v for v in media.variants if v.bitrate], key=lambda v: v.bitrate)
|
||||
url = variant.url
|
||||
elif type(media) == snscrape.modules.twitter.Gif:
|
||||
url = media.variants[0].url
|
||||
elif type(media) == snscrape.modules.twitter.Photo:
|
||||
url = media.fullUrl
|
||||
else:
|
||||
logger.warning(f"Could not get media URL of {media}")
|
||||
url = None
|
||||
|
||||
if url is not None:
|
||||
archived_url = self.archive_media(url)
|
||||
archived_urls[url] = archived_url
|
||||
|
||||
yield cisticola.base.ScraperResult(
|
||||
scraper=self.__version__,
|
||||
platform="Twitter",
|
||||
channel=channel.id,
|
||||
platform_id=tweet.id,
|
||||
date=tweet.date,
|
||||
date_archived=datetime.now(),
|
||||
raw_data=tweet.json()))
|
||||
|
||||
return posts
|
||||
raw_data=tweet.json(),
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def can_handle(self, channel):
|
||||
if channel.platform == "Twitter" and TwitterScraper.get_username_from_url(channel.url) is not None:
|
||||
if channel.platform == "Twitter" and channel.platform_id:
|
||||
return True
|
||||
|
||||
29
test.py
29
test.py
@@ -1,38 +1,43 @@
|
||||
# TODO/TODECIDE:
|
||||
# should 'username' be a part of the Channel definition somehow?
|
||||
# still need to do some planning for handling media
|
||||
|
||||
import cisticola
|
||||
import cisticola.scraper.telegram_snscrape
|
||||
import cisticola.scraper.twitter
|
||||
import cisticola.scraper.gettr
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
|
||||
|
||||
test_channels = [cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
|
||||
test_channels = [
|
||||
cisticola.base.Channel(id=0, name="Logan Williams (test)", platform_id=891729132,
|
||||
category="test", followers=None, platform="Twitter",
|
||||
url="https://twitter.com/obtusatum", country="US",
|
||||
url="https://twitter.com/obtusatum", screenname="obtusatum", country="US",
|
||||
influencer=None, public=True, chat=False,
|
||||
notes=""),
|
||||
cisticola.base.Channel(id=1, name="JQHN SPARTAN", platform_id=-1001181961026,
|
||||
category="qanon", followers=None, platform="Telegram",
|
||||
url="https://t.me/jqhnspartan", country="FR",
|
||||
url="https://t.me/jqhnspartan", screenname="jqhnspartan", country="FR",
|
||||
influencer="JQNH SPARTAN", public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(id=2, name="LizardRepublic", platform_id='lizardrepublic',
|
||||
category="qanon", followers=None, platform="Gettr",
|
||||
url="https://www.gettr.com/user/lizardrepublic", country="US",
|
||||
url="https://www.gettr.com/user/lizardrepublic", screenname="lizardrepublic", country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),
|
||||
cisticola.base.Channel(id=3, name="Patriot Front", platform_id='OVv9QZL4sEsC',
|
||||
category="nazi", followers=None, platform="Bitchute",
|
||||
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", country="US",
|
||||
url="https://www.bitchute.com/channel/OVv9QZL4sEsC/", screenname=None, country="US",
|
||||
influencer=None, public=True, chat=False, notes=""),]
|
||||
|
||||
|
||||
controller = cisticola.ScraperController()
|
||||
|
||||
scraper = cisticola.scraper.twitter.TwitterScraper()
|
||||
controller.register_scraper(scraper)
|
||||
twitter = cisticola.scraper.twitter.TwitterScraper()
|
||||
controller.register_scraper(twitter)
|
||||
|
||||
engine = create_engine('sqlite:///test.db')
|
||||
telegram = cisticola.scraper.telegram_snscrape.TelegramSnscrapeScraper()
|
||||
controller.register_scraper(telegram)
|
||||
|
||||
gettr = cisticola.scraper.gettr.GettrScraper()
|
||||
controller.register_scraper(gettr)
|
||||
|
||||
engine = create_engine('sqlite:///test3.db')
|
||||
controller.connect_to_db(engine)
|
||||
|
||||
controller.scrape_channels(test_channels)
|
||||
|
||||
Reference in New Issue
Block a user