mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-11 12:58:33 +03:00
merged main
This commit is contained in:
1
Pipfile
1
Pipfile
@@ -25,6 +25,7 @@ gabber = {git = "https://github.com/stanfordio/gabber.git"}
|
||||
psycopg2-binary = "*"
|
||||
tqdm = "*"
|
||||
ratelimit = "*"
|
||||
pytz = "*"
|
||||
|
||||
[dev-packages]
|
||||
pytest = "*"
|
||||
|
||||
16
Pipfile.lock
generated
16
Pipfile.lock
generated
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4"
|
||||
"sha256": "89ac092ac8c8321f199f199da0c0867803a44b080538a43e1a57ae7713683616"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@@ -773,7 +773,7 @@
|
||||
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
|
||||
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"index": "pypi",
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"pytz-deprecation-shim": {
|
||||
@@ -872,7 +872,9 @@
|
||||
"version": "==2022.3.2"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [],
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -1387,11 +1389,13 @@
|
||||
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
|
||||
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"index": "pypi",
|
||||
"version": "==2022.1"
|
||||
},
|
||||
"requests": {
|
||||
"extras": [],
|
||||
"extras": [
|
||||
"socks"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
|
||||
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
|
||||
@@ -1475,7 +1479,7 @@
|
||||
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
|
||||
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"markers": "python_full_version < '3.11.0'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"typing-extensions": {
|
||||
|
||||
5
app.py
5
app.py
@@ -5,6 +5,8 @@ from sqlalchemy import create_engine, func
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
import telethon.errors.rpcerrorlist
|
||||
|
||||
from cisticola.base import Channel, RawChannelInfo, mapper_registry
|
||||
from cisticola.scraper import (
|
||||
@@ -112,6 +114,8 @@ def init_db():
|
||||
mapper_registry.metadata.create_all(bind=engine)
|
||||
|
||||
if __name__ == '__main__':
|
||||
logger.remove()
|
||||
logger.add(sys.stdout, level="DEBUG", catch=True)
|
||||
logger.add("./test.log", level="TRACE")
|
||||
|
||||
parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
|
||||
@@ -121,6 +125,7 @@ if __name__ == '__main__':
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
if args.command == 'init-db':
|
||||
init_db()
|
||||
elif args.command == 'sync-channels':
|
||||
|
||||
@@ -275,6 +275,7 @@ class Scraper:
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
"""Scrape all posts from the specified Channel.
|
||||
|
||||
@@ -341,7 +342,6 @@ class ScraperController:
|
||||
|
||||
return self.scrape_channel_info(channels)
|
||||
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
"""Scrape all posts for all specified channels.
|
||||
|
||||
@@ -388,6 +388,9 @@ class ScraperController:
|
||||
session.add(post)
|
||||
added += 1
|
||||
|
||||
if added > 100:
|
||||
break
|
||||
|
||||
session.commit()
|
||||
logger.info(
|
||||
f"{scraper} found {added} new posts from {channel}")
|
||||
|
||||
@@ -8,6 +8,7 @@ from typing import Generator
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, RawChannelInfo
|
||||
from cisticola.scraper.base import Scraper
|
||||
@@ -22,6 +23,7 @@ class BitchuteScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
session = requests.Session()
|
||||
|
||||
@@ -2,6 +2,7 @@ from datetime import datetime, timezone, date
|
||||
import json
|
||||
from typing import Generator
|
||||
import os
|
||||
from loguru import logger
|
||||
|
||||
from gabber.client import Client, GAB_API_BASE_URL
|
||||
|
||||
@@ -22,6 +23,7 @@ class GabScraper(Scraper):
|
||||
|
||||
return group_id
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = Client(
|
||||
username = os.environ['GAB_USER'],
|
||||
|
||||
@@ -2,6 +2,7 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from gogettr import PublicClient
|
||||
|
||||
@@ -19,6 +20,7 @@ class GettrScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
client = PublicClient()
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -25,6 +25,7 @@ class InstagramScraper(Scraper):
|
||||
username = url.split(BASE_URL)[1].strip('/')
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -25,6 +25,7 @@ class OdyseeScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -2,6 +2,7 @@ from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
@@ -14,6 +15,7 @@ class RumbleScraper(Scraper):
|
||||
"""An implementation of a Scraper for Rumble, using custom functions"""
|
||||
__version__ = "RumbleScraper 0.0.1"
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
scraper = get_channel_videos(channel.url)
|
||||
|
||||
@@ -15,6 +15,7 @@ class TelegramSnscrapeScraper(Scraper):
|
||||
if channel.platform == "Telegram" and channel.public and not channel.chat:
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
scr = snscrape.modules.telegram.TelegramChannelScraper(
|
||||
channel.screenname)
|
||||
|
||||
@@ -105,6 +105,7 @@ class TelegramTelethonScraper(Scraper):
|
||||
if channel.platform == "Telegram" and channel.public:
|
||||
return True
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
username = channel.screenname
|
||||
if username is None:
|
||||
|
||||
@@ -12,6 +12,7 @@ class TwitterScraper(Scraper):
|
||||
"""An implementation of a Scraper for Twitter, using snscrape library"""
|
||||
__version__ = "TwitterScraper 0.0.1"
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
if channel.platform_id:
|
||||
identifier = int(channel.platform_id)
|
||||
|
||||
@@ -20,6 +20,7 @@ class VkontakteScraper(Scraper):
|
||||
|
||||
return username
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
username = self.get_username_from_url(channel.url)
|
||||
|
||||
@@ -15,6 +15,7 @@ class YoutubeScraper(Scraper):
|
||||
"""An implementation of a Scraper for Youtube, using youtube-dl"""
|
||||
__version__ = "YoutubeScraper 0.0.1"
|
||||
|
||||
@logger.catch
|
||||
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
|
||||
|
||||
content_type = 'video/mp4'
|
||||
|
||||
Reference in New Issue
Block a user