merged main

This commit is contained in:
Tristan Lee
2022-04-01 02:05:25 -05:00
15 changed files with 35 additions and 7 deletions

View File

@@ -25,6 +25,7 @@ gabber = {git = "https://github.com/stanfordio/gabber.git"}
psycopg2-binary = "*"
tqdm = "*"
ratelimit = "*"
pytz = "*"
[dev-packages]
pytest = "*"

16
Pipfile.lock generated
View File

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "b9fc02f3ecaa2199480c4fcba30f02780860dfbc2e10c026889c78f639709fb4"
"sha256": "89ac092ac8c8321f199f199da0c0867803a44b080538a43e1a57ae7713683616"
},
"pipfile-spec": 6,
"requires": {
@@ -773,7 +773,7 @@
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"markers": "python_version < '3.9'",
"index": "pypi",
"version": "==2022.1"
},
"pytz-deprecation-shim": {
@@ -872,7 +872,9 @@
"version": "==2022.3.2"
},
"requests": {
"extras": [],
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1387,11 +1389,13 @@
"sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7",
"sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"
],
"markers": "python_version < '3.9'",
"index": "pypi",
"version": "==2022.1"
},
"requests": {
"extras": [],
"extras": [
"socks"
],
"hashes": [
"sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61",
"sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"
@@ -1475,7 +1479,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
],
"markers": "python_version >= '3.7'",
"markers": "python_full_version < '3.11.0'",
"version": "==2.0.1"
},
"typing-extensions": {

5
app.py
View File

@@ -5,6 +5,8 @@ from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker
import os
import time
import sys
import telethon.errors.rpcerrorlist
from cisticola.base import Channel, RawChannelInfo, mapper_registry
from cisticola.scraper import (
@@ -112,6 +114,8 @@ def init_db():
mapper_registry.metadata.create_all(bind=engine)
if __name__ == '__main__':
logger.remove()
logger.add(sys.stdout, level="DEBUG", catch=True)
logger.add("./test.log", level="TRACE")
parser = argparse.ArgumentParser(description = 'Cisticola command line tools')
@@ -121,6 +125,7 @@ if __name__ == '__main__':
args = parser.parse_args()
if args.command == 'init-db':
init_db()
elif args.command == 'sync-channels':

View File

@@ -275,6 +275,7 @@ class Scraper:
raise NotImplementedError
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
"""Scrape all posts from the specified Channel.
@@ -341,7 +342,6 @@ class ScraperController:
return self.scrape_channel_info(channels)
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
"""Scrape all posts for all specified channels.
@@ -388,6 +388,9 @@ class ScraperController:
session.add(post)
added += 1
if added > 100:
break
session.commit()
logger.info(
f"{scraper} found {added} new posts from {channel}")

View File

@@ -8,6 +8,7 @@ from typing import Generator
import requests
from bs4 import BeautifulSoup
from loguru import logger
from cisticola.base import Channel, ScraperResult, RawChannelInfo
from cisticola.scraper.base import Scraper
@@ -22,6 +23,7 @@ class BitchuteScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
session = requests.Session()

View File

@@ -2,6 +2,7 @@ from datetime import datetime, timezone, date
import json
from typing import Generator
import os
from loguru import logger
from gabber.client import Client, GAB_API_BASE_URL
@@ -22,6 +23,7 @@ class GabScraper(Scraper):
return group_id
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = Client(
username = os.environ['GAB_USER'],

View File

@@ -2,6 +2,7 @@ from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
from loguru import logger
from gogettr import PublicClient
@@ -19,6 +20,7 @@ class GettrScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
client = PublicClient()
username = self.get_username_from_url(channel.url)

View File

@@ -25,6 +25,7 @@ class InstagramScraper(Scraper):
username = url.split(BASE_URL)[1].strip('/')
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)

View File

@@ -25,6 +25,7 @@ class OdyseeScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)

View File

@@ -2,6 +2,7 @@ from datetime import datetime, timezone
import json
from typing import Generator
from urllib.parse import urlparse
from loguru import logger
from bs4 import BeautifulSoup
@@ -14,6 +15,7 @@ class RumbleScraper(Scraper):
"""An implementation of a Scraper for Rumble, using custom functions"""
__version__ = "RumbleScraper 0.0.1"
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
scraper = get_channel_videos(channel.url)

View File

@@ -15,6 +15,7 @@ class TelegramSnscrapeScraper(Scraper):
if channel.platform == "Telegram" and channel.public and not channel.chat:
return True
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
scr = snscrape.modules.telegram.TelegramChannelScraper(
channel.screenname)

View File

@@ -105,6 +105,7 @@ class TelegramTelethonScraper(Scraper):
if channel.platform == "Telegram" and channel.public:
return True
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = channel.screenname
if username is None:

View File

@@ -12,6 +12,7 @@ class TwitterScraper(Scraper):
"""An implementation of a Scraper for Twitter, using snscrape library"""
__version__ = "TwitterScraper 0.0.1"
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
if channel.platform_id:
identifier = int(channel.platform_id)

View File

@@ -20,6 +20,7 @@ class VkontakteScraper(Scraper):
return username
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
username = self.get_username_from_url(channel.url)

View File

@@ -15,6 +15,7 @@ class YoutubeScraper(Scraper):
"""An implementation of a Scraper for Youtube, using youtube-dl"""
__version__ = "YoutubeScraper 0.0.1"
@logger.catch
def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
content_type = 'video/mp4'