mirror of
https://github.com/bellingcat/polyphemus.git
synced 2026-06-11 21:08:32 +03:00
converted script into Python package
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,2 +1,5 @@
|
||||
*.csv
|
||||
**/data/
|
||||
**/data/
|
||||
|
||||
**/__pycache__/
|
||||
*.pyc
|
||||
52
examples/scrape.py
Normal file
52
examples/scrape.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Scrape all video and comment data from a specified Odysee channel
|
||||
"""
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
import csv
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from polyphemus.base import OdyseeChannel
|
||||
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
CHANNEL_NAME = 'PatriotFront'
|
||||
OUTPUT_DIR = Path('.').resolve().parents[1]/'data'
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
if __name__ == '__main__':

    # Build the channel object, then collect the processed video and comment records.
    odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)
    video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments()

    # One table per record type; the channel info is a single dict, hence the one-row frame.
    channel_df = pd.DataFrame([odysee_channel.info])
    video_df = pd.DataFrame(video_info_list)
    comment_df = pd.DataFrame(comment_info_list)

    # All output for a channel goes into its own subdirectory of OUTPUT_DIR.
    output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
    os.makedirs(output_subdir, exist_ok = True)

    exports = (
        (channel_df, f'{CHANNEL_NAME}_channel.csv'),
        (video_df, f'{CHANNEL_NAME}_videos.csv'),
        (comment_df, f'{CHANNEL_NAME}_comments.csv'))

    # Write the three tables in the same order as before: channel, videos, comments.
    for frame, filename in exports:
        frame.to_csv(
            path_or_buf = Path(output_subdir, filename),
            index = False,
            quoting = csv.QUOTE_NONNUMERIC )
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
8
polyphemus/__init__.py
Normal file
8
polyphemus/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
from . import base
|
||||
from . import utils
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
@@ -1,26 +1,20 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Functions and classes for scraping video data from Odysee video platform.
|
||||
"""Base classes and methods for scraping video data from Odysee video platform.
|
||||
"""
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
import json
|
||||
import csv
|
||||
from pathlib import Path
|
||||
import os
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
#TODO Figure out how to reverse-engineer this
|
||||
AUTH_TOKEN = 'BseGAiye641UqUsv4g31ZcUCRiLasv3U'
|
||||
|
||||
CHANNEL_NAME = 'PatriotFront'
|
||||
OUTPUT_DIR = Path('.').resolve().parent/'data'
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
class OdyseeChannel:
|
||||
@@ -57,7 +51,7 @@ class OdyseeChannel:
|
||||
|
||||
info = result['result'][channel_url]
|
||||
|
||||
channel_info = {
|
||||
info = {
|
||||
'channel_id' : info['claim_id'],
|
||||
'title' : info['value']['title'],
|
||||
'created': info['timestamp'],
|
||||
@@ -66,8 +60,8 @@ class OdyseeChannel:
|
||||
'thumbnail_image': info['value']['thumbnail']['url'],
|
||||
'raw' : response.text}
|
||||
|
||||
self._channel_info = channel_info
|
||||
self._channel_id = self._channel_info['channel_id']
|
||||
self.info = info
|
||||
self._channel_id = self.info['channel_id']
|
||||
|
||||
#-------------------------------------------------------------------------#
|
||||
|
||||
@@ -121,7 +115,7 @@ class OdyseeChannel:
|
||||
def process_all_videos(self):
|
||||
|
||||
self.get_all_videos()
|
||||
all_videos_processed = [OdyseeVideo(video)._video_info for video in self._all_videos]
|
||||
all_videos_processed = [OdyseeVideo(video).info for video in self._all_videos]
|
||||
|
||||
return all_videos_processed
|
||||
|
||||
@@ -131,7 +125,7 @@ class OdyseeChannel:
|
||||
|
||||
self.get_all_videos()
|
||||
all_videos = [OdyseeVideo(video) for video in self._all_videos]
|
||||
all_videos_processed = [video._video_info for video in all_videos]
|
||||
all_videos_processed = [video.info for video in all_videos]
|
||||
|
||||
all_comments_processed = []
|
||||
|
||||
@@ -148,19 +142,20 @@ class OdyseeVideo:
|
||||
|
||||
def __init__(self, full_video_info):
|
||||
|
||||
self._video_info = {
|
||||
self.info = {
|
||||
'canonical_url' : full_video_info['canonical_url'],
|
||||
'channel' : full_video_info['signing_channel']['name'],
|
||||
'claim_id' : full_video_info['claim_id'],
|
||||
'created' : full_video_info['value']['release_time'],
|
||||
'description' : full_video_info['value']['description'],
|
||||
'languages' : full_video_info['value']['languages'],
|
||||
'description' : full_video_info['value'].get('description'),
|
||||
'languages' : full_video_info['value'].get('languages'),
|
||||
'tags' : full_video_info['value'].get('tags',[]),
|
||||
'title' : full_video_info['value']['title'],
|
||||
'duration' : full_video_info['value']['video']['duration'],
|
||||
'thumbnail' : full_video_info['value']['thumbnail']['url'],
|
||||
'raw' : json.dumps(full_video_info)}
|
||||
|
||||
self._claim_id = self._video_info ['claim_id']
|
||||
self._claim_id = self.info ['claim_id']
|
||||
|
||||
self.get_views()
|
||||
self.get_video_reactions()
|
||||
@@ -181,7 +176,7 @@ class OdyseeVideo:
|
||||
response = requests.get(api_url, params = params)
|
||||
views = json.loads(response.text)['data'][0]
|
||||
|
||||
self._video_info['views'] = views
|
||||
self.info['views'] = views
|
||||
|
||||
#-------------------------------------------------------------------------#
|
||||
|
||||
@@ -200,8 +195,8 @@ class OdyseeVideo:
|
||||
result = json.loads(response.text)
|
||||
reactions = result['data']['others_reactions'][self._claim_id ]
|
||||
|
||||
self._video_info['likes'] = reactions['like']
|
||||
self._video_info['dislikes'] = reactions['dislike']
|
||||
self.info['likes'] = reactions['like']
|
||||
self.info['dislikes'] = reactions['dislike']
|
||||
|
||||
#-------------------------------------------------------------------------#
|
||||
|
||||
@@ -262,17 +257,40 @@ class OdyseeVideo:
|
||||
def process_all_comments(self):
|
||||
|
||||
self.get_all_comments()
|
||||
all_comments_processed = [OdyseeComment(comment)._comment_info for comment in self._all_comments]
|
||||
all_comments_processed = [OdyseeComment(comment).info for comment in self._all_comments]
|
||||
|
||||
return all_comments_processed
|
||||
|
||||
#-------------------------------------------------------------------------#
|
||||
|
||||
def get_recommended(self, n = 20):
    """Return the videos Odysee's recommendation service relates to this one.

    Sends this video's title and claim id to ``recsys.odysee.com/search``,
    resolves every returned name with ``name_to_video_info``, keeps only the
    results whose ``value_type`` is ``'stream'`` and wraps each of them in
    an ``OdyseeVideo``.

    Parameters
    ----------
    n : int
        Number of results to request (sent as the ``size`` parameter).

    Returns
    -------
    list
        ``OdyseeVideo`` objects, one per recommended stream.
    """

    api_url = 'https://recsys.odysee.com/search'

    # requests percent-encodes query parameters on its own, so the raw title
    # is passed here; quoting it first would send a double-encoded search term.
    params = {
        's':self.info['title'],
        'size':str(int(n)),
        'from':'0',
        'related_to':self._claim_id}

    response = requests.get(api_url, params = params)
    result = json.loads(response.text)

    recommended_video_info = [name_to_video_info(r['name']) for r in result]

    # Resolved names can be non-video claims; only streams become OdyseeVideo objects.
    recommended_video_info = [vi for vi in recommended_video_info if vi['value_type'] == 'stream']
    recommended_videos = [OdyseeVideo(video_info) for video_info in recommended_video_info]

    return recommended_videos
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
class OdyseeComment:
|
||||
|
||||
def __init__(self, full_comment_info):
|
||||
|
||||
self._comment_info = {
|
||||
self.info = {
|
||||
'comment' : full_comment_info['comment'],
|
||||
'created' : full_comment_info['timestamp'],
|
||||
'video_claim_id' : full_comment_info['claim_id'],
|
||||
@@ -329,37 +347,21 @@ def append_comment_reactions(comments):
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
if __name__ == '__main__':

    # Build the channel object, then collect the processed video and comment records.
    odysee_channel = OdyseeChannel(channel_name = CHANNEL_NAME)
    video_info_list, comment_info_list = odysee_channel.process_all_videos_and_comments()

    # One table per record type; the channel info is a single dict, hence the one-row frame.
    channel_df = pd.DataFrame([odysee_channel._channel_info])
    video_df = pd.DataFrame(video_info_list)
    comment_df = pd.DataFrame(comment_info_list)

    # All output for a channel goes into its own subdirectory of OUTPUT_DIR.
    output_subdir = Path(OUTPUT_DIR, CHANNEL_NAME)
    os.makedirs(output_subdir, exist_ok = True)

    exports = (
        (channel_df, f'{CHANNEL_NAME}_channel.csv'),
        (video_df, f'{CHANNEL_NAME}_videos.csv'),
        (comment_df, f'{CHANNEL_NAME}_comments.csv'))

    # Write the three tables in the same order as before: channel, videos, comments.
    for frame, filename in exports:
        frame.to_csv(
            path_or_buf = Path(output_subdir, filename),
            index = False,
            quoting = csv.QUOTE_NONNUMERIC )
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def name_to_video_info(name):
    """Resolve a claim name to the record the Odysee proxy API returns for it.

    Posts a JSON-RPC ``resolve`` request for ``lbry://<name>`` and returns
    the entry of the response's ``result`` mapping keyed by that URL.
    """

    lbry_url = f"lbry://{name}"
    endpoint = 'https://api.na-backend.odysee.com/api/v1/proxy'

    payload = {
        "jsonrpc":"2.0",
        "method":"resolve",
        "params":{"urls":[lbry_url]}}

    reply = requests.post(url = endpoint, json = payload)
    resolved = json.loads(reply.text)['result']

    # The result mapping is keyed by the exact URL that was requested.
    return resolved[lbry_url]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
43
polyphemus/utils.py
Normal file
43
polyphemus/utils.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Utility functions for scraping video data from Odysee video platform.
|
||||
"""
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
import json
|
||||
|
||||
import requests
|
||||
|
||||
from .base import OdyseeVideo
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def name_to_video_info(name):
    """Resolve a claim name to the record the Odysee proxy API returns for it.

    Posts a JSON-RPC ``resolve`` request for ``lbry://<name>`` and returns
    the entry of the response's ``result`` mapping keyed by that URL.
    """

    lbry_url = f"lbry://{name}"
    endpoint = 'https://api.na-backend.odysee.com/api/v1/proxy'

    payload = {
        "jsonrpc":"2.0",
        "method":"resolve",
        "params":{"urls":[lbry_url]}}

    reply = requests.post(url = endpoint, json = payload)
    resolved = json.loads(reply.text)['result']

    # The result mapping is keyed by the exact URL that was requested.
    return resolved[lbry_url]
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def name_to_video(name):
    """Resolve a claim name and wrap the resulting record in an ``OdyseeVideo``."""

    return OdyseeVideo(name_to_video_info(name))
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
36
setup.py
Normal file
36
setup.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
import os.path
|
||||
from setuptools import setup
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
def readme( ):
    """Return the text of the ``README.md`` located next to this file."""

    readme_path = os.path.abspath(
        os.path.join(
            os.path.dirname( __file__ ),
            'README.md' ) )

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open( readme_path, encoding = 'utf-8' ) as f:
        return f.read( )
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
|
||||
# Package metadata; the long description is the text returned by readme( ).
setup(
    name = 'polyphemus',
    version = '0.1',
    description = 'Scraping Odysee video data',
    long_description = readme( ),
    # README.md is Markdown; declare it so package indexes do not render it as reStructuredText.
    long_description_content_type = 'text/markdown',
    author = 'Bellingcat',
    packages = [
        'polyphemus' ],
    install_requires = [
        'requests >= 2.27.0',
        'beautifulsoup4 >= 4.10.0',
        'pandas >= 1.4.0'],
    include_package_data = True,
    zip_safe = False )
|
||||
|
||||
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
|
||||
Reference in New Issue
Block a user