mirror of
https://github.com/bellingcat/cisticola.git
synced 2026-06-08 03:18:34 +03:00
incorporated vkontakte scraper
This commit is contained in:
@@ -7,4 +7,5 @@ from .odysee import OdyseeScraper
|
||||
from .rumble import RumbleScraper
|
||||
from .telegram_snscrape import TelegramSnscrapeScraper
|
||||
from .telegram_telethon import TelegramTelethonScraper
|
||||
from .twitter import TwitterScraper
|
||||
from .twitter import TwitterScraper
|
||||
from .vkontakte import VkontakteScraper
|
||||
@@ -8,6 +8,7 @@ import boto3
|
||||
from loguru import logger
|
||||
import ffmpeg
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import youtube_dl
|
||||
|
||||
from cisticola.base import Channel, ScraperResult, mapper_registry
|
||||
from cisticola.utils import make_request
|
||||
@@ -69,6 +70,38 @@ class Scraper:
|
||||
|
||||
return blob, content_type, key
|
||||
|
||||
def youtubedl_url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Download the video behind ``url`` with youtube_dl and return its bytes.

    The video is downloaded into a temporary directory, read fully into
    memory, and the directory is removed again before returning.

    Args:
        url: Page or media URL that youtube_dl is asked to extract.
        key: Storage key for the blob. When ``None``, the key is derived
            with ``self.url_to_key``.

    Returns:
        A ``(blob, content_type, key)`` tuple. ``content_type`` is always
        ``'video/mp4'``.

    Raises:
        youtube_dl.utils.DownloadError: propagated unchanged when the
            extraction or download fails.
    """
    content_type = 'video/mp4'

    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = youtube_dl.YoutubeDL({
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "noplaylist": True,
            'quiet': True,
            "verbose": False,
        })

        # Any DownloadError raised here simply propagates to the caller.
        info = downloader.extract_info(url, download=True)

        # The output template above names the file "<id>.<ext>".
        downloaded_path = f"{temp_dir}/{info['id']}.{info['ext']}"
        with open(downloaded_path, "rb") as video_file:
            blob = video_file.read()

    if key is None:
        key = self.url_to_key(url=url, content_type=content_type)

    return blob, content_type, key
|
||||
|
||||
def archive_blob(self, blob: bytes, content_type: str, key: str) -> str:
|
||||
|
||||
filename = self.__version__.replace(' ', '_') + '/' + key
|
||||
@@ -101,7 +134,7 @@ class ScraperController:
|
||||
def register_scrapers(self, scraper: List[Scraper]):
    """Append every scraper in ``scraper`` to ``self.scrapers``."""
    self.scrapers.extend(scraper)
|
||||
|
||||
@logger.catch
|
||||
@logger.catch(reraise = True)
|
||||
def scrape_channels(self, channels: List[Channel], archive_media: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from typing import Generator, Tuple
|
||||
import tempfile
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import youtube_dl
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper import Scraper, make_request
|
||||
@@ -37,7 +34,7 @@ class RumbleScraper(Scraper):
|
||||
|
||||
url = post['media_url']
|
||||
|
||||
media_blob, content_type, key = self.url_to_blob(url)
|
||||
media_blob, content_type, key = self.youtubedl_url_to_blob(url)
|
||||
archived_url = self.archive_blob(media_blob, content_type, key)
|
||||
archived_urls[post['media_url']] = archived_url
|
||||
|
||||
@@ -51,43 +48,15 @@ class RumbleScraper(Scraper):
|
||||
raw_data=json.dumps(post),
|
||||
archived_urls=archived_urls)
|
||||
|
||||
def url_to_key(self, url: str, content_type: str) -> str:
    """Build a storage key from the second-to-last path segment of ``url``.

    The extension is the part of ``content_type`` after the last ``/``
    (for example ``video/mp4`` gives ``.mp4``).
    """
    subtype = content_type.split('/')[-1]
    path_segments = urlparse(url).path.split('/')
    return path_segments[-2] + '.' + subtype
|
||||
|
||||
def can_handle(self, channel):
    """Return ``True`` for Rumble channels whose URL yields a username.

    Falls through (returning ``None``) for every other channel.
    """
    if channel.platform != "Rumble":
        return None
    if RumbleScraper.get_username_from_url(channel.url) is not None:
        return True
|
||||
|
||||
def url_to_blob(self, url: str, key: str = None) -> Tuple[bytes, str, str]:
    """Download a Rumble video with youtube_dl and return its bytes.

    Args:
        url: URL that youtube_dl is asked to extract.
        key: Storage key for the blob. When ``None``, the key is the
            second-to-last path segment of ``url`` plus ``.mp4``.

    Returns:
        A ``(blob, content_type, key)`` tuple. ``content_type`` is always
        ``'video/mp4'``.

    Raises:
        youtube_dl.utils.DownloadError: propagated unchanged when the
            extraction or download fails.
    """
    content_type = 'video/mp4'
    ext = '.' + content_type.split('/')[-1]

    with tempfile.TemporaryDirectory() as temp_dir:
        downloader = youtube_dl.YoutubeDL({
            "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
            "merge_output_format": "mp4",
            "outtmpl": f"{temp_dir}/%(id)s.%(ext)s",
            "noplaylist": True,
            'quiet': True,
            "verbose": False,
        })

        # Any DownloadError raised here simply propagates to the caller.
        info = downloader.extract_info(url, download=True)

        # The output template above names the file "<id>.<ext>".
        downloaded_path = f"{temp_dir}/{info['id']}.{info['ext']}"
        with open(downloaded_path, "rb") as video_file:
            blob = video_file.read()

    if key is None:
        key = urlparse(url).path.split('/')[-2] + ext

    return blob, content_type, key
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def get_media_url(url):
|
||||
|
||||
80
cisticola/scraper/vkontakte.py
Normal file
80
cisticola/scraper/vkontakte.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from datetime import datetime, timezone
|
||||
from typing import Generator
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from snscrape.modules.vkontakte import VKontakteUserScraper
|
||||
from loguru import logger
|
||||
|
||||
from cisticola.base import Channel, ScraperResult
|
||||
from cisticola.scraper.base import Scraper
|
||||
|
||||
class VkontakteScraper(Scraper):
    """An implementation of a Scraper for Vkontakte, using the snscrape library."""

    __version__ = "VkontakteScraper 0.0.1"

    # URL paths ending in one of these are treated as images by url_to_key.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')

    def get_username_from_url(self, url):
        """Return the VK username (first path segment) from a profile URL.

        Accepts ``https://vk.com/<name>`` as well as variants with another
        scheme or subdomain, a trailing slash, a query string, or no scheme.
        """
        if '://' not in url:
            # Without a scheme urlparse would put the host into the path.
            url = 'https://' + url

        username = urlparse(url).path.strip('/').split('/')[0]

        return username

    @staticmethod
    def _to_utc_datetime(value):
        """Return ``value`` as a datetime tagged with UTC.

        NOTE(review): snscrape's VKontakte items appear to carry either a
        ``datetime.datetime`` or a plain ``datetime.date``; a plain date has
        no ``tzinfo`` to replace, so it is widened to midnight first.
        """
        if isinstance(value, datetime):
            return value.replace(tzinfo=timezone.utc)
        return datetime.fromordinal(value.toordinal()).replace(tzinfo=timezone.utc)

    def get_posts(self, channel: Channel, since: ScraperResult = None, archive_media: bool = True) -> Generator[ScraperResult, None, None]:
        """Yield one ScraperResult per post of ``channel``.

        Args:
            channel: Channel whose ``url`` identifies the VK page to scrape.
            since: Previous result; iteration stops once posts are not newer
                than its ``date_archived``. ``None`` scrapes everything.
            archive_media: When true, photos and videos are downloaded and
                archived, and their archived URLs are attached to the result.
        """
        username = self.get_username_from_url(channel.url)
        scraper = VKontakteUserScraper(username)

        cutoff = None
        if since is not None:
            cutoff = since.date_archived.replace(tzinfo=timezone.utc)

        first = True

        for post in scraper.get_items():
            if cutoff is not None and self._to_utc_datetime(post.date) <= cutoff:
                # with VKontakteUserScraper, the first post could be an old pinned post
                if first:
                    first = False
                    continue
                else:
                    break

            archived_urls = {}

            if archive_media:
                for photo in post.photos or []:
                    # Archive only the largest rendition of each photo.
                    variant = max(
                        photo.variants,
                        key=lambda v: v.width * v.height,
                        default=None)
                    url = variant.url if variant is not None else None

                    if url is not None:
                        media_blob, content_type, key = self.url_to_blob(url)
                        archived_url = self.archive_blob(media_blob, content_type, key)
                        archived_urls[url] = archived_url

                if post.video:
                    url = post.video.url
                    media_blob, content_type, key = self.youtubedl_url_to_blob(url)
                    archived_url = self.archive_blob(media_blob, content_type, key)
                    archived_urls[url] = archived_url

            yield ScraperResult(
                scraper=self.__version__,
                # Was misspelled "Vkontatke"; must match the name used by can_handle.
                platform="Vkontakte",
                channel=channel.id,
                platform_id=post.url.split('/')[-1],
                # Stored with day precision (time of day is dropped).
                date=datetime.fromordinal(post.date.toordinal()).replace(tzinfo=timezone.utc),
                date_archived=datetime.now(timezone.utc),
                raw_data=post.json(),
                archived_urls=archived_urls)

    def can_handle(self, channel):
        """Return whether ``channel`` is a Vkontakte channel with a platform id."""
        return channel.platform == "Vkontakte" and bool(channel.platform_id)

    def url_to_key(self, url: str, content_type: str) -> str:
        """Derive the storage key for a media URL.

        Image URLs keep their file name, prefixed with the parent path
        segment; everything else is treated as a video and gets ``.mp4``.
        """
        path = urlparse(url).path
        segments = path.split('/')

        if path.lower().endswith(self._IMAGE_EXTENSIONS):
            key = '_'.join(segments[-2:])
        else:
            key = segments[-1] + '.mp4'

        return key
|
||||
@@ -38,7 +38,7 @@ class ETLController:
|
||||
self.session = sessionmaker()
|
||||
self.session.configure(bind=engine)
|
||||
|
||||
@logger.catch
|
||||
@logger.catch(reraise = True)
|
||||
def transform_results(self, results: List[ScraperResult], hydrate: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
@@ -76,7 +76,7 @@ class ETLController:
|
||||
if handled == False:
|
||||
logger.warning(f"No Transformer could handle {result}")
|
||||
|
||||
@logger.catch
|
||||
@logger.catch(reraise = True)
|
||||
def transform_all_untransformed(self, hydrate: bool = True):
|
||||
if self.session is None:
|
||||
logger.error("No DB session")
|
||||
|
||||
@@ -111,6 +111,21 @@ TWITTER_CHANNEL_KWARGS = {
|
||||
'chat': False,
|
||||
'notes': ''}
|
||||
|
||||
# Keyword arguments for the Vkontakte test Channel.
VKONTAKTE_CHANNEL_KWARGS = {
    'id': 6,
    'name': 'Wwg1wgA (test)',
    'platform_id': 'club201278078',
    'category': 'test',
    'followers': None,
    'platform': 'Vkontakte',
    'url': 'https://vk.com/club201278078',
    'screenname': 'Wwg1wgA',
    'country': 'FR',
    'influencer': None,
    'public': True,
    'chat': False,
    'notes': '',
}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
@pytest.fixture(scope='package')
|
||||
@@ -142,6 +157,7 @@ def channel_kwargs():
|
||||
'odysee' : ODYSEE_CHANNEL_KWARGS,
|
||||
'rumble' : RUMBLE_CHANNEL_KWARGS,
|
||||
'telegram' : TELEGRAM_CHANNEL_KWARGS,
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS}
|
||||
'twitter' : TWITTER_CHANNEL_KWARGS,
|
||||
'vkontakte' : VKONTAKTE_CHANNEL_KWARGS}
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
16
tests/scraper/vkontakte.py
Normal file
16
tests/scraper/vkontakte.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from cisticola.base import Channel
|
||||
from cisticola.scraper import VkontakteScraper
|
||||
|
||||
def test_scrape_vkontakte_channel_no_media(controller, channel_kwargs):
    """Scrape the Vkontakte test channel without archiving any media."""
    vk_channel = Channel(**channel_kwargs['vkontakte'])

    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vk_channel], archive_media=False)
|
||||
|
||||
def test_scrape_vkontakte_channel(controller, channel_kwargs):
    """Scrape the Vkontakte test channel from a fresh database, archiving media."""
    controller.reset_db()

    vk_channel = Channel(**channel_kwargs['vkontakte'])

    controller.register_scraper(scraper=VkontakteScraper())
    controller.scrape_channels(channels=[vk_channel], archive_media=True)
|
||||
Reference in New Issue
Block a user