incorporated vkontakte scraper

This commit is contained in:
Tristan Lee
2022-03-10 22:32:39 -06:00
parent 3d919316a9
commit 821c39004b
7 changed files with 158 additions and 43 deletions

View File

@@ -7,4 +7,5 @@ from .odysee import OdyseeScraper
from .rumble import RumbleScraper
from .telegram_snscrape import TelegramSnscrapeScraper
from .telegram_telethon import TelegramTelethonScraper
from .twitter import TwitterScraper
from .twitter import TwitterScraper
from .vkontakte import VkontakteScraper

View File

@@ -8,6 +8,7 @@ import boto3
from loguru import logger
import ffmpeg
from sqlalchemy.orm import sessionmaker
import youtube_dl
from cisticola.base import Channel, ScraperResult, mapper_registry
from cisticola.utils import make_request
@@ -69,6 +70,38 @@ class Scraper:
return blob, content_type, key
def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Download the media behind ``url`` with youtube-dl and return it in memory.

    The file is downloaded into a temporary directory, read fully into
    memory, and the directory is removed again before returning.

    Args:
        url: Page or media URL that youtube-dl can extract.
        key: Storage key to return. When ``None`` it is derived with
            ``self.url_to_key(url=..., content_type=...)``.

    Returns:
        A ``(blob, content_type, key)`` tuple.

    Raises:
        youtube_dl.utils.DownloadError: propagated unchanged when the
            extraction or download fails.
    """
    # Local import keeps this method self-contained; mimetypes is stdlib.
    import mimetypes

    with tempfile.TemporaryDirectory() as temp_dir:
        ydl_opts = {
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "noplaylist": True,
            'quiet': True,
            "verbose": False,
        }
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        # Errors are deliberately not caught here: callers see DownloadError.
        meta = ydl.extract_info(url, download=True)

        # Mirrors the "outtmpl" pattern above: <id>.<ext>
        filename = f"{meta['id']}.{meta['ext']}"
        with open(f"{temp_dir}/{filename}", "rb") as f:
            blob = f.read()

    # The format selector ends in "/best", so the downloaded container is not
    # guaranteed to be mp4. Derive the MIME type from the real extension and
    # fall back to the previous hard-coded value when it is unknown.
    content_type = mimetypes.guess_type(filename)[0] or 'video/mp4'

    if key is None:
        key = self.url_to_key(url = url, content_type = content_type)

    return blob, content_type, key
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
filename = self.__version__.replace(' ', '_') + '/' + key
@@ -101,7 +134,7 @@ class ScraperController:
def register_scrapers(self, scraper: List[Scraper]):
    """Add every scraper in ``scraper`` to the end of ``self.scrapers``."""
    registered = self.scrapers
    registered.extend(scraper)
@logger.catch
@logger.catch(reraise = True)
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
if self.session is None:
logger.error("No DB session")

View File

@@ -1,12 +1,9 @@
from datetime import datetime, timezone
import json
from typing import Generator, Tuple
import tempfile
from typing import Generator
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
import youtube_dl
from cisticola.base import Channel, ScraperResult
from cisticola.scraper import Scraper, make_request
@@ -37,7 +34,7 @@ class RumbleScraper(Scraper):
url = post['media_url']
media_blob, content_type, key = self.url_to_blob(url)
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
archived_url = self.archive_blob(media_blob, content_type, key)
archived_urls[post['media_url']] = archived_url
@@ -51,43 +48,15 @@ class RumbleScraper(Scraper):
raw_data=json.dumps(post),
archived_urls=archived_urls)
def url_to_key(self, url: str, content_type: str) -> str:
    """Build a storage key from the second-to-last URL path segment.

    The extension is the part of ``content_type`` after its last ``/``.
    """
    subtype = content_type.split('/')[-1]
    segments = urlparse(url).path.split('/')
    return f"{segments[-2]}.{subtype}"
def can_handle(self, channel):
    """Return ``True`` when ``channel`` is a Rumble channel with a usable URL.

    A channel is handled when its ``platform`` is ``"Rumble"`` and
    ``RumbleScraper.get_username_from_url`` yields a non-``None`` value for
    its ``url``. Always returns a real ``bool`` (previously ``None`` was
    returned implicitly for channels that are not handled).
    """
    if channel.platform != "Rumble":
        return False
    return RumbleScraper.get_username_from_url(channel.url) is not None
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Fetch the video at ``url`` through youtube-dl and return its bytes.

    Returns a ``(blob, content_type, key)`` tuple. ``content_type`` is always
    ``'video/mp4'``; when ``key`` is not supplied it is built from the
    second-to-last URL path segment plus the ``.mp4`` suffix. A
    ``youtube_dl.utils.DownloadError`` raised by the download propagates to
    the caller.
    """
    content_type = 'video/mp4'

    with tempfile.TemporaryDirectory() as workdir:
        downloader = youtube_dl.YoutubeDL({
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{workdir}/%(id)s.%(ext)s",
            "noplaylist": True,
            'quiet': True,
            "verbose": False,
        })
        info = downloader.extract_info(url, download=True)

        # Same layout as the "outtmpl" pattern: <id>.<ext> inside workdir.
        downloaded_path = f"{workdir}/{info['id']}.{info['ext']}"
        with open(downloaded_path, "rb") as handle:
            blob = handle.read()

    if key is None:
        suffix = '.' + content_type.split('/')[-1]
        key = urlparse(url).path.split('/')[-2] + suffix

    return blob, content_type, key
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
def get_media_url(url):

View File

@@ -0,0 +1,80 @@
from datetime import datetime, timezone
from typing import Generator
from urllib.parse import urlparse
from snscrape.modules.vkontakte import VKontakteUserScraper
from loguru import logger
from cisticola.base import Channel, ScraperResult
from cisticola.scraper.base import Scraper
class VkontakteScraper(Scraper):
    """An implementation of a Scraper for Vkontakte, using snscrape library"""

    __version__ = "VkontakteScraper 0.0.1"

    # URL path suffixes that url_to_key treats as photos rather than videos.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')

    def get_username_from_url(self, url):
        """Return the path part of a VK profile URL, e.g. ``club201278078``.

        Works for ``https://vk.com/<name>`` as before, and additionally
        tolerates other schemes/hosts, a missing scheme, trailing slashes and
        query strings instead of raising ``IndexError``.
        """
        # urlparse only separates host from path when a scheme is present.
        parsed = urlparse(url if '://' in url else 'https://' + url)
        return parsed.path.strip('/')

    @staticmethod
    def _as_utc_datetime(value):
        """Coerce a ``date`` or ``datetime`` into a ``datetime`` tagged as UTC.

        A date-only value becomes midnight of that day. A ``datetime`` keeps
        its clock time and only has its ``tzinfo`` replaced.
        """
        if not isinstance(value, datetime):
            value = datetime.fromordinal(value.toordinal())
        return value.replace(tzinfo=timezone.utc)

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ``ScraperResult`` per post of ``channel``.

        Args:
            channel: Channel whose ``url`` identifies the VK page to scrape.
            since: When given, iteration stops at the first post (other than
                the very first one) dated at or before
                ``since.date_archived``.
            archive_media: When true, photos and the video of each post are
                downloaded and archived, and the mapping original URL ->
                archived URL is stored in ``archived_urls``.
        """
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)

        cutoff = None
        if since is not None:
            cutoff = since.date_archived.replace(tzinfo=timezone.utc)

        for index, post in enumerate(scraper.get_items()):
            # post.date may be date-only (the stored `date` below already
            # allows for that), so normalise it before comparing.
            if cutoff is not None and self._as_utc_datetime(post.date) <= cutoff:
                # with VKontakteUserScraper, the first post could be an old
                # pinned post: skip it, but stop at any later old post.
                if index == 0:
                    continue
                break

            archived_urls = {}

            if archive_media:
                for photo in post.photos or []:
                    # Archive only the largest variant of each photo.
                    variant = max(
                        photo.variants,
                        key=lambda v: v.width * v.height,
                        default=None)
                    if variant is None or variant.url is None:
                        continue

                    url = variant.url
                    media_blob, content_type, key = self.url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

                if post.video:
                    url = post.video.url
                    media_blob, content_type, key = self.youtubedl_url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
                # Was misspelled "Vkontatke"; must match can_handle below.
                platform="Vkontakte",
                channel=channel.id,
                platform_id=post.url.split('/')[-1],
                # NOTE(review): this keeps only the calendar day (midnight UTC)
                # even when post.date carries a time - confirm that is intended.
                date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return ``True`` for Vkontakte channels that have a ``platform_id``."""
        return channel.platform == "Vkontakte" and bool(channel.platform_id)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Derive a storage key from ``url``.

        Image URLs keep their own file name, prefixed with the parent path
        segment; everything else is treated as a video and gets ``.mp4``
        appended to the last path segment. ``content_type`` is not consulted.
        """
        path = urlparse(url).path
        if path.lower().endswith(self._IMAGE_EXTENSIONS):
            return '_'.join(path.split('/')[-2:])
        return path.split('/')[-1] + '.mp4'

View File

@@ -38,7 +38,7 @@ class ETLController:
self.session = sessionmaker()
self.session.configure(bind=engine)
@logger.catch
@logger.catch(reraise = True)
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
if self.session is None:
logger.error("No DB session")
@@ -76,7 +76,7 @@ class ETLController:
if handled == False:
logger.warning(f"No Transformer could handle {result}")
@logger.catch
@logger.catch(reraise = True)
def transform_all_untransformed(self, hydrate: bool = True):
if self.session is None:
logger.error("No DB session")

View File

@@ -111,6 +111,21 @@ TWITTER_CHANNEL_KWARGS = {
'chat': False,
'notes': ''}
# Keyword arguments for the Vkontakte test channel fixture.
VKONTAKTE_CHANNEL_KWARGS = {
    "id": 6,
    "name": "Wwg1wgA (test)",
    "platform_id": "club201278078",
    "category": "test",
    "followers": None,
    "platform": "Vkontakte",
    "url": "https://vk.com/club201278078",
    "screenname": "Wwg1wgA",
    "country": "FR",
    "influencer": None,
    "public": True,
    "chat": False,
    "notes": "",
}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
@pytest.fixture(scope='package')
@@ -142,6 +157,7 @@ def channel_kwargs():
'odysee' : ODYSEE_CHANNEL_KWARGS,
'rumble' : RUMBLE_CHANNEL_KWARGS,
'telegram' : TELEGRAM_CHANNEL_KWARGS,
'twitter' : TWITTER_CHANNEL_KWARGS}
'twitter' : TWITTER_CHANNEL_KWARGS,
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

View File

@@ -0,0 +1,16 @@
from cisticola.base import Channel
from cisticola.scraper import VkontakteScraper
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
    """Scrape the Vkontakte test channel with media archiving switched off."""
    vk_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper = VkontakteScraper())
    controller.scrape_channels(channels = [vk_channel], archive_media = False)
def test_scrape_vkontakte_channel(controller, channel_kwargs):
    """Reset the DB, then scrape the Vkontakte test channel and archive its media."""
    controller.reset_db()

    vk_channel = Channel(**channel_kwargs['vkontakte'])
    controller.register_scraper(scraper = VkontakteScraper())
    controller.scrape_channels(channels = [vk_channel], archive_media = True)