Created scraper.py and test_find_multiple_authors.py

This commit is contained in:
Richard Mwewa
2022-09-30 21:36:06 +02:00
parent f9ae8cd851
commit 1b6376ff00
2 changed files with 74 additions and 0 deletions

64
scraper.py Normal file
View File

@@ -0,0 +1,64 @@
import argparse
from collections import defaultdict
from itertools import combinations
from itertools import islice
from youtube_comment_downloader import YoutubeCommentDownloader
def get_comment_dict(video_url, max_comments=100):
    """
    Group the comments of a single video by the author who wrote them.

    At most ``max_comments`` comments are read from the downloader's
    comment stream for ``video_url``. The result is a ``defaultdict``
    keyed by each comment's ``'author'`` value; every value is the list
    of that author's comment records in the order they were yielded.
    """
    by_author = defaultdict(list)
    stream = YoutubeCommentDownloader().get_comments_from_url(video_url)
    # islice stops consuming the stream once the cap is reached.
    for entry in islice(stream, max_comments):
        by_author[entry['author']].append(entry)
    return by_author
def _extract_video_id(url):
    """Return the video id embedded in a YouTube ``url``."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    # Watch-style URLs carry the id in the ``v`` query parameter, which may
    # appear at any position in the query string.
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]
    # Short / embed style URLs carry the id as the last path segment.
    tail = parsed.path.rstrip('/').rsplit('/', 1)[-1]
    if tail and tail != 'watch':
        return tail
    raise ValueError(f'Could not determine a video id from URL: {url!r}')


def find_multiple_authors(video_urls, max_comments=100):
    """
    Report authors who commented on more than one of the given videos.

    Comments are fetched for every URL in ``video_urls`` (at most
    ``max_comments`` per video), then every pair of videos is compared.
    For each pair the common authors are printed, followed by the first
    100 characters of each of their comments on both videos.

    Results are keyed by video id, so URLs that resolve to the same id
    are treated as a single video. Nothing is returned; all output goes
    to stdout. Raises ``ValueError`` if a URL contains no video id.
    """
    # video_dict maps the video id to the comment dict for that video
    video_dict = {}
    for url in video_urls:
        vid_uid = _extract_video_id(url)
        print('Getting comments for video: ', vid_uid)
        video_dict[vid_uid] = get_comment_dict(url, max_comments=max_comments)

    # Iterate over the possible combinations of videos
    for (vid_id1, dict1), (vid_id2, dict2) in combinations(video_dict.items(), r=2):
        # Dict key views support set intersection directly
        common_authors = dict1.keys() & dict2.keys()
        print(f'Videos: {vid_id1} & {vid_id2} have {len(common_authors)} common authors')
        print(common_authors)
        for author in common_authors:
            print(f'Author: {author}')
            # List every comment the author left on each video,
            # truncated to the first 100 characters
            for vid_id, comment_dict in ((vid_id1, dict1), (vid_id2, dict2)):
                print(f'Video {vid_id} comments: ')
                for number, comment in enumerate(comment_dict[author], 1):
                    print(number, comment['text'][:100])
            print()
if __name__ == '__main__':
    # Sample videos compared when no URLs are given on the command line.
    vids = [
        'https://www.youtube.com/watch?v=C1Gn85NKrCU',
        'https://www.youtube.com/watch?v=UMzKD73cs3c',
        'https://www.youtube.com/watch?v=e7VOQ1l20eo'
    ]
    parser = argparse.ArgumentParser(
        prog='scraper.py',
        description='Find authors who commented on more than one YouTube video.',
    )
    # Accept any number of real URLs; the previous ``choices`` list only
    # allowed literal placeholder strings and yielded a single string.
    parser.add_argument(
        'videos',
        nargs='*',
        default=vids,
        metavar='VIDEO_URL',
        help='YouTube video URLs to compare (defaults to built-in samples)',
    )
    args = parser.parse_args()
    # A comparison needs a pair of videos.
    if len(args.videos) < 2:
        parser.error('at least two video URLs are required')
    find_multiple_authors(args.videos)

View File

@@ -0,0 +1,10 @@
from scraper import find_multiple_authors
def test_find_multiple_users():
    """
    Smoke test: run the author comparison over three sample videos.

    The original note says these videos come from Google's YouTube
    channel. The test passes as long as the call does not raise; it
    makes no assertion about the printed output.
    """
    sample_video_urls = [
        'https://www.youtube.com/watch?v=8qGV_O_y4DA',
        'https://www.youtube.com/watch?v=WSkETCRe7Ic',
        'https://www.youtube.com/watch?v=cdgQpa1pUUE'
    ]
    find_multiple_authors(sample_video_urls)