From 821c39004b8fbd429da0ffeb4eb81865f5bb3ee3 Mon Sep 17 00:00:00 2001 From: Tristan Lee Date: Thu, 10 Mar 2022 22:32:39 -0600 Subject: [PATCH] incorporated vkontakte scraper --- cisticola/scraper/__init__.py | 3 +- cisticola/scraper/base.py | 35 ++++++++++++++- cisticola/scraper/rumble.py | 45 +++---------------- cisticola/scraper/vkontakte.py | 80 ++++++++++++++++++++++++++++++++++ cisticola/transformer/base.py | 4 +- tests/conftest.py | 18 +++++++- tests/scraper/vkontakte.py | 16 +++++++ 7 files changed, 158 insertions(+), 43 deletions(-) create mode 100644 cisticola/scraper/vkontakte.py create mode 100644 tests/scraper/vkontakte.py diff --git a/cisticola/scraper/__init__.py b/cisticola/scraper/__init__.py index 4f33931..92a3e7a 100644 --- a/cisticola/scraper/__init__.py +++ b/cisticola/scraper/__init__.py @@ -7,4 +7,5 @@ from .odysee import OdyseeScraper from .rumble import RumbleScraper from .telegram_snscrape import TelegramSnscrapeScraper from .telegram_telethon import TelegramTelethonScraper -from .twitter import TwitterScraper \ No newline at end of file +from .twitter import TwitterScraper +from .vkontakte import VkontakteScraper \ No newline at end of file diff --git a/cisticola/scraper/base.py b/cisticola/scraper/base.py index f2eae25..9666faf 100644 --- a/cisticola/scraper/base.py +++ b/cisticola/scraper/base.py @@ -8,6 +8,7 @@ import boto3 from loguru import logger import ffmpeg from sqlalchemy.orm import sessionmaker +import youtube_dl from cisticola.base import Channel, ScraperResult, mapper_registry from cisticola.utils import make_request @@ -69,6 +70,38 @@ class Scraper: return blob, content_type, key + def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: + + content_type = 'video/mp4' + + with tempfile.TemporaryDirectory() as temp_dir: + ydl_opts = { + "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "merge_output_format": "mp4", + "outtmpl": f"{temp_dir}/%(id)s.%(ext)s", + "noplaylist": 
True, + 'quiet': True, + "verbose": False,} + ydl = youtube_dl.YoutubeDL(ydl_opts) + + try: + meta = ydl.extract_info( + url, + download=True,) + except youtube_dl.utils.DownloadError as e: + raise e + else: + video_id = meta["id"] + video_ext = meta["ext"] + + with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f: + blob = f.read() + + if key is None: + key = self.url_to_key(url = url, content_type = content_type) + + return blob, content_type, key + def archive_blob(self, blob: bytes, content_type: str, key: str) -> str: filename = self.__version__.replace(' ', '_') + '/' + key @@ -101,7 +134,7 @@ class ScraperController: def register_scrapers(self, scraper: List[Scraper]): self.scrapers.extend(scraper) - @logger.catch + @logger.catch(reraise = True) def scrape_channels(self, channels: List[Channel], archive_media: bool = True): if self.session is None: logger.error("No DB session") diff --git a/cisticola/scraper/rumble.py b/cisticola/scraper/rumble.py index dbb4194..f0cdefb 100644 --- a/cisticola/scraper/rumble.py +++ b/cisticola/scraper/rumble.py @@ -1,12 +1,9 @@ from datetime import datetime, timezone import json -from typing import Generator, Tuple -import tempfile +from typing import Generator from urllib.parse import urlparse -import requests from bs4 import BeautifulSoup -import youtube_dl from cisticola.base import Channel, ScraperResult from cisticola.scraper import Scraper, make_request @@ -37,7 +34,7 @@ class RumbleScraper(Scraper): url = post['media_url'] - media_blob, content_type, key = self.url_to_blob(url) + media_blob, content_type, key = self.youtubedl_url_to_blob(url) archived_url = self.archive_blob(media_blob, content_type, key) archived_urls[post['media_url']] = archived_url @@ -51,43 +48,15 @@ class RumbleScraper(Scraper): raw_data=json.dumps(post), archived_urls=archived_urls) + def url_to_key(self, url: str, content_type: str) -> str: + ext = '.' 
+ content_type.split('/')[-1] + key = urlparse(url).path.split('/')[-2] + ext + return key + def can_handle(self, channel): if channel.platform == "Rumble" and RumbleScraper.get_username_from_url(channel.url) is not None: return True - def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]: - - content_type = 'video/mp4' - ext = '.' + content_type.split('/')[-1] - - with tempfile.TemporaryDirectory() as temp_dir: - ydl_opts = { - "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", - "merge_output_format": "mp4", - "outtmpl": f"{temp_dir}/%(id)s.%(ext)s", - "noplaylist": True, - 'quiet': True, - "verbose": False,} - ydl = youtube_dl.YoutubeDL(ydl_opts) - - try: - meta = ydl.extract_info( - url, - download=True,) - except youtube_dl.utils.DownloadError as e: - raise e - else: - video_id = meta["id"] - video_ext = meta["ext"] - - with open(f"{temp_dir}/{video_id}.{video_ext}", "rb") as f: - blob = f.read() - - if key is None: - key = urlparse(url).path.split('/')[-2] + ext - - return blob, content_type, key - #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# def get_media_url(url): diff --git a/cisticola/scraper/vkontakte.py b/cisticola/scraper/vkontakte.py new file mode 100644 index 0000000..7535324 --- /dev/null +++ b/cisticola/scraper/vkontakte.py @@ -0,0 +1,80 @@ +from datetime import datetime, timezone +from typing import Generator +from urllib.parse import urlparse + +from snscrape.modules.vkontakte import VKontakteUserScraper +from loguru import logger + +from cisticola.base import Channel, ScraperResult +from cisticola.scraper.base import Scraper + +class VkontakteScraper(Scraper): + """An implementation of a Scraper for Vkontakte, using snscrape library""" + __version__ = "VkontakteScraper 0.0.1" + + def get_username_from_url(self, url): + username = url.split('https://vk.com/')[1] + + return username + + def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = 
True) -> Generator[ScraperResult, None, None]: + + username = self.get_username_from_url(channel.url) + scraper = VKontakteUserScraper(username) + + first = True + + for post in scraper.get_items(): + if since is not None and post.date.replace(tzinfo=timezone.utc) <= since.date_archived.replace(tzinfo=timezone.utc): + # with VKontakteUserScraper, the first post could be an old pinned post + if first: + first = False + continue + else: + break + + archived_urls = {} + + if archive_media: + + if post.photos: + + for photo in post.photos: + variant = max( + [v for v in photo.variants], key=lambda v: v.width * v.height) + url = variant.url + + if url is not None: + media_blob, content_type, key = self.url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url + + if post.video: + url = post.video.url + media_blob, content_type, key = self.youtubedl_url_to_blob(url) + archived_url = self.archive_blob(media_blob, content_type, key) + archived_urls[url] = archived_url + + yield ScraperResult( + scraper=self.__version__, + platform="Vkontatke", + channel=channel.id, + platform_id=post.url.split('/')[-1], + date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc), + date_archived=datetime.now(timezone.utc), + raw_data=post.json(), + archived_urls=archived_urls) + + def can_handle(self, channel): + if channel.platform == "Vkontakte" and channel.platform_id: + return True + + def url_to_key(self, url: str, content_type: str) -> str: + path = urlparse(url).path + if path.endswith('.jpg'): + key = '_'.join(path.split('/')[-2:]) + else: + ext = '.mp4' + key = path.split('/')[-1] + ext + + return key \ No newline at end of file diff --git a/cisticola/transformer/base.py b/cisticola/transformer/base.py index e320c4c..8e9eaa1 100644 --- a/cisticola/transformer/base.py +++ b/cisticola/transformer/base.py @@ -38,7 +38,7 @@ class ETLController: self.session = sessionmaker() 
self.session.configure(bind=engine) - @logger.catch + @logger.catch(reraise = True) def transform_results(self, results: List[ScraperResult], hydrate: bool = True): if self.session is None: logger.error("No DB session") @@ -76,7 +76,7 @@ class ETLController: if handled == False: logger.warning(f"No Transformer could handle {result}") - @logger.catch + @logger.catch(reraise = True) def transform_all_untransformed(self, hydrate: bool = True): if self.session is None: logger.error("No DB session") diff --git a/tests/conftest.py b/tests/conftest.py index 161439d..26bd92b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,6 +111,21 @@ TWITTER_CHANNEL_KWARGS = { 'chat': False, 'notes': ''} +VKONTAKTE_CHANNEL_KWARGS = { + 'id': 6, + 'name': 'Wwg1wgA (test)', + 'platform_id': 'club201278078', + 'category': 'test', + 'followers': None, + 'platform': 'Vkontakte', + 'url': 'https://vk.com/club201278078', + 'screenname': 'Wwg1wgA', + 'country': 'FR', + 'influencer': None, + 'public': True, + 'chat': False, + 'notes': ''} + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# @pytest.fixture(scope='package') @@ -142,6 +157,7 @@ def channel_kwargs(): 'odysee' : ODYSEE_CHANNEL_KWARGS, 'rumble' : RUMBLE_CHANNEL_KWARGS, 'telegram' : TELEGRAM_CHANNEL_KWARGS, - 'twitter' : TWITTER_CHANNEL_KWARGS} + 'twitter' : TWITTER_CHANNEL_KWARGS, + 'vkontakte' : VKONTAKTE_CHANNEL_KWARGS} #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++# \ No newline at end of file diff --git a/tests/scraper/vkontakte.py b/tests/scraper/vkontakte.py new file mode 100644 index 0000000..ef7cfa1 --- /dev/null +++ b/tests/scraper/vkontakte.py @@ -0,0 +1,16 @@ +from cisticola.base import Channel +from cisticola.scraper import VkontakteScraper + +def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs): + + channels = [Channel(**channel_kwargs['vkontakte'])] + controller.register_scraper(scraper = VkontakteScraper()) + 
controller.scrape_channels(channels = channels, archive_media = False) + +def test_scrape_vkontakte_channel(controller, channel_kwargs): + + controller.reset_db() + + channels = [Channel(**channel_kwargs['vkontakte'])] + controller.register_scraper(scraper = VkontakteScraper()) + controller.scrape_channels(channels = channels, archive_media = True)