Created scraper.py and test_find_multiple_authors.py

2026-06-07 19:18:31 +03:00 · 2022-09-30 21:36:06 +02:00
parent f9ae8cd851
commit 1b6376ff00
2 changed files with 74 additions and 0 deletions
--- a/scraper.py
+++ b/scraper.py
@@ -0,0 +1,64 @@
+import argparse
+from collections import defaultdict
+from itertools import combinations
+from itertools import islice
+from youtube_comment_downloader import YoutubeCommentDownloader
+
+
+def get_comment_dict(video_url, max_comments=100):
+    """Group a video's comments by the author who wrote them.
+
+    Reads at most max_comments comments for video_url and returns a
+    defaultdict mapping each author to the list of their comments.
+    """
+    by_author = defaultdict(list)
+    stream = YoutubeCommentDownloader().get_comments_from_url(video_url)
+    for entry in islice(stream, max_comments):
+        by_author[entry['author']].append(entry)
+
+    return by_author
+
+def find_multiple_authors(video_urls):
+    """Print the authors who commented on more than one of video_urls.
+
+    For every pair of videos, prints the authors they share and the first
+    100 characters of each comment those authors left on either video.
+    """
+    from urllib.parse import parse_qs, urlparse
+
+    # video_dict maps each video id to the comment dict for that video
+    video_dict = {}
+    for url in video_urls:
+        parts = urlparse(url)
+        # Read the 'v' query parameter wherever it sits in the query string;
+        # without one, use the last path segment (e.g. https://youtu.be/<id>).
+        last_segment = parts.path.rstrip('/').rsplit('/', 1)[-1]
+        vid_uid = parse_qs(parts.query).get('v', [last_segment])[0]
+        print('Getting comments for video: ', vid_uid)
+        video_dict[vid_uid] = get_comment_dict(url)
+
+    # Compare every unordered pair of videos
+    for (vid_id1, dict1), (vid_id2, dict2) in combinations(video_dict.items(), r=2):
+        common_authors = dict1.keys() & dict2.keys()
+        print(f'Videos: {vid_id1} & {vid_id2} have {len(common_authors)}')
+        print(common_authors)
+        for author in common_authors:
+            print(f'Author: {author}')
+            for vid_id, comment_dict in ((vid_id1, dict1), (vid_id2, dict2)):
+                print(f'Video {vid_id} comments: ')
+                for i, comment in enumerate(comment_dict[author], start=1):
+                    print(i, comment['text'][:100])
+            print()
+
+
+if __name__ == '__main__':
+    vids = [
+        'https://www.youtube.com/watch?v=C1Gn85NKrCU',
+        'https://www.youtube.com/watch?v=UMzKD73cs3c',
+        'https://www.youtube.com/watch?v=e7VOQ1l20eo'
+    ]
+    parser = argparse.ArgumentParser(prog='scraper.py')
+    # URLs are optional; with none given, fall back to the sample videos above
+    parser.add_argument('videos', nargs='*', default=vids, metavar='video_url')
+    find_multiple_authors(parser.parse_args().videos)
+
--- a/test_find_multiple_authors.py
+++ b/test_find_multiple_authors.py
@@ -0,0 +1,10 @@
+from scraper import find_multiple_authors
+     
+def test_find_multiple_users():
+    """Smoke-run find_multiple_authors on videos from Google's YouTube channel."""
+    google_channel_urls = [
+        'https://www.youtube.com/watch?v=8qGV_O_y4DA',
+        'https://www.youtube.com/watch?v=WSkETCRe7Ic',
+        'https://www.youtube.com/watch?v=cdgQpa1pUUE',
+    ]
+    find_multiple_authors(google_channel_urls)