From 1b6376ff009a33e80c41214430822a8f4a657fa1 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 30 Sep 2022 21:36:06 +0200
Subject: [PATCH] Created scraper.py and test_find_multiple_authors.py

---
Review notes (ignored when the patch is applied):
- `videos` now takes any number of urls and falls back to the sample
  videos; previously it only accepted placeholder strings and the
  script crashed on them.
- The video id is read from the `v` query parameter with urllib.parse.
- The `index` lines were dropped because the blob ids no longer match
  the revised file contents.

 scraper.py                    | 88 +++++++++++++++++++++++++++++++++++
 test_find_multiple_authors.py | 11 ++++
 2 files changed, 99 insertions(+)
 create mode 100644 scraper.py
 create mode 100644 test_find_multiple_authors.py

diff --git a/scraper.py b/scraper.py
new file mode 100644
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,88 @@
+import argparse
+from collections import defaultdict
+from itertools import combinations
+from itertools import islice
+from urllib.parse import parse_qs
+from urllib.parse import urlparse
+
+from youtube_comment_downloader import YoutubeCommentDownloader
+
+# Videos compared when no urls are given on the command line
+DEFAULT_VIDEO_URLS = [
+    'https://www.youtube.com/watch?v=C1Gn85NKrCU',
+    'https://www.youtube.com/watch?v=UMzKD73cs3c',
+    'https://www.youtube.com/watch?v=e7VOQ1l20eo',
+]
+
+
+def get_comment_dict(video_url, max_comments=100):
+    """
+    Creates a dictionary mapping comment-authors
+    to a list of their comments. Only the first
+    max_comments comments of the video are read.
+    """
+    downloader = YoutubeCommentDownloader()
+    comment_dict = defaultdict(list)
+    comments = downloader.get_comments_from_url(video_url)
+    for comment in islice(comments, max_comments):
+        comment_dict[comment['author']].append(comment)
+
+    return comment_dict
+
+
+def get_video_id(url):
+    """
+    Returns the 'v' query parameter of a video url,
+    or the url itself when that parameter is missing
+    """
+    query = parse_qs(urlparse(url).query)
+    return query.get('v', [url])[0]
+
+
+def find_multiple_authors(video_urls):
+    """
+    Prints, for every pair of videos, the authors who
+    commented on both and the comments they left
+    """
+    # video_dict maps the video url id to the
+    # comment dict for that video
+    video_dict = {}
+    for url in video_urls:
+        vid_uid = get_video_id(url)
+        print('Getting comments for video: ', vid_uid)
+        video_dict[vid_uid] = get_comment_dict(url)
+
+    # Iterate over the possible combinations of videos
+    for item1, item2 in combinations(video_dict.items(), r=2):
+        # Unpack from tuple
+        vid_id1, dict1 = item1
+        vid_id2, dict2 = item2
+        # Use set intersection to find common authors
+        common_authors = dict1.keys() & dict2.keys()
+        print(f'Videos: {vid_id1} & {vid_id2} have '
+              f'{len(common_authors)} common authors')
+        print(common_authors)
+        for author in common_authors:
+            print(f'Author: {author}')
+            print(f'Video {vid_id1} comments: ')
+            # Iterate over each comment author left on video1
+            # and print first 100 chars
+            for i, comment in enumerate(dict1[author]):
+                print(i+1, comment['text'][:100])
+            print(f'Video {vid_id2} comments: ')
+            for i, comment in enumerate(dict2[author]):
+                print(i+1, comment['text'][:100])
+
+            print()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(prog='scraper.py')
+    parser.add_argument(
+        'videos',
+        nargs='*',
+        default=DEFAULT_VIDEO_URLS,
+        help='urls of the YouTube videos to compare',
+    )
+    args = parser.parse_args()
+    find_multiple_authors(args.videos)
diff --git a/test_find_multiple_authors.py b/test_find_multiple_authors.py
new file mode 100644
--- /dev/null
+++ b/test_find_multiple_authors.py
@@ -0,0 +1,11 @@
+from scraper import find_multiple_authors
+
+
+def test_find_multiple_users():
+    # List contains videos from Google's YouTube channel
+    vids = [
+        'https://www.youtube.com/watch?v=8qGV_O_y4DA',
+        'https://www.youtube.com/watch?v=WSkETCRe7Ic',
+        'https://www.youtube.com/watch?v=cdgQpa1pUUE'
+    ]
+    find_multiple_authors(vids)