From 1b6376ff009a33e80c41214430822a8f4a657fa1 Mon Sep 17 00:00:00 2001
From: Richard Mwewa <74001397+rly0nheart@users.noreply.github.com>
Date: Fri, 30 Sep 2022 21:36:06 +0200
Subject: [PATCH] Created scraper.py and test_find_multiple_authors.py

---
 scraper.py                    | 64 +++++++++++++++++++++++++++++++++++
 test_find_multiple_authors.py | 10 ++++++
 2 files changed, 74 insertions(+)
 create mode 100644 scraper.py
 create mode 100644 test_find_multiple_authors.py

diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..c3c475d
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,64 @@
+import argparse
+from collections import defaultdict
+from itertools import combinations
+from itertools import islice
+from youtube_comment_downloader import YoutubeCommentDownloader
+
+
+def get_comment_dict(video_url, max_comments=100):
+    """Group the first ``max_comments`` comments of a video by author.
+
+    Returns a defaultdict(list) keyed by each comment's 'author' value;
+    every value is the list of that author's comment objects, in the
+    order the downloader yielded them.
+    """
+    by_author = defaultdict(list)
+    stream = YoutubeCommentDownloader().get_comments_from_url(video_url)
+    for entry in islice(stream, max_comments):
+        by_author[entry['author']].append(entry)
+    return by_author
+
+def find_multiple_authors(video_urls):
+    """Print the authors who commented on more than one of ``video_urls``.
+
+    Comments are fetched per video with get_comment_dict; each pair of videos
+    is then compared and every shared author's comments are printed.
+    """
+    from urllib.parse import parse_qs, urlsplit
+
+    # video_dict maps each video id to the comment dict for that video
+    video_dict = {}
+    for url in video_urls:
+        # Take the id from the 'v' query parameter wherever it appears; fall
+        # back to the last path segment (youtu.be/<id>), then to the url itself.
+        parts = urlsplit(url)
+        v_values = parse_qs(parts.query).get('v', [''])
+        vid_uid = v_values[0] or parts.path.rstrip('/').rsplit('/', 1)[-1] or url
+        print('Getting comments for video: ', vid_uid)
+        video_dict[vid_uid] = get_comment_dict(url)
+
+    for (vid_id1, dict1), (vid_id2, dict2) in combinations(video_dict.items(), r=2):
+        common_authors = dict1.keys() & dict2.keys()  # authors present on both
+        print(f'Videos: {vid_id1} & {vid_id2} have {len(common_authors)}')
+        print(common_authors)
+        for author in common_authors:
+            print(f'Author: {author}')
+            # Print the first 100 chars of each comment, numbered from 1
+            for vid_id, comment_dict in ((vid_id1, dict1), (vid_id2, dict2)):
+                print(f'Video {vid_id} comments: ')
+                for i, comment in enumerate(comment_dict[author], start=1):
+                    print(i, comment['text'][:100])
+            print()
+
+
+if __name__ == '__main__':
+    vids = [  # sample videos compared when no URL is passed on the command line
+        'https://www.youtube.com/watch?v=C1Gn85NKrCU',
+        'https://www.youtube.com/watch?v=UMzKD73cs3c',
+        'https://www.youtube.com/watch?v=e7VOQ1l20eo'
+    ]
+    parser = argparse.ArgumentParser(prog='scraper.py')
+    parser.add_argument('videos', nargs='*', default=vids, metavar='youtube_video_url',
+                        help='video URLs to compare (default: three sample videos)')
+    find_multiple_authors(parser.parse_args().videos)
+
diff --git a/test_find_multiple_authors.py b/test_find_multiple_authors.py
new file mode 100644
index 0000000..8eef190
--- /dev/null
+++ b/test_find_multiple_authors.py
@@ -0,0 +1,10 @@
+from scraper import find_multiple_authors
+     
+def test_find_multiple_users():
+    """Smoke test: the call must not raise on three of Google's YouTube videos."""
+    google_channel_urls = [
+        'https://www.youtube.com/watch?v=8qGV_O_y4DA',
+        'https://www.youtube.com/watch?v=WSkETCRe7Ic',
+        'https://www.youtube.com/watch?v=cdgQpa1pUUE',
+    ]
+    find_multiple_authors(google_channel_urls)
\ No newline at end of file