# youtube-comment-scraper/scraper.py

import tqdm
import requests
import argparse
from collections import defaultdict
from itertools import combinations
from itertools import islice
from youtube_comment_downloader import YoutubeCommentDownloader


# Version of this program; compared against the latest GitHub release tag
# by the update check and interpolated into the licence notice.
program_version_number = "2022.1.0.0"

# GitHub REST endpoint queried for the project's latest release.
update_check_endpoint = (
    'https://api.github.com/repos/'
    'rly0nheart/YouTube-Comment-Scraper/releases/latest'
)

def notice():
    """
    Prints the program name, its version number and a short
    GNU GPL licence summary to standard output
    """
    print(f"""
    YouTube-Comment-Scraper {program_version_number} Copyright (C) 2022  Richard Mwewa

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    """)


def check_and_get_updates():
    """
    Prints the licence notice, then checks GitHub for a newer release.

    The latest release is fetched from ``update_check_endpoint`` and its
    ``tag_name`` is compared with ``program_version_number``. If they match,
    nothing else happens. Otherwise the user is asked whether to install the
    new release; on "y" every project file is downloaded from the ``master``
    branch and written over the file of the same name in the current working
    directory, after which the program exits so that it can be re-run.

    Raises:
        KeyError: if the release response has no ``tag_name`` entry.
        requests.HTTPError: if a file download returns an error status; this
            is raised before the local file is opened, so it is left intact.
        SystemExit: after the files have been updated.
    """
    notice()
    response = requests.get(update_check_endpoint).json()
    latest_tag = response['tag_name']
    if latest_tag == program_version_number:
        return

    update_prompt = input(f"[?] A new release is available ({latest_tag}). Would you like to install it? (y/n) ")
    if update_prompt.lower() != "y":
        return

    files_to_update = ['scraper.py', 'test_find_multiple_users.py', 'README.md', 'requirements.txt']
    # The file does `import tqdm`, which binds the module; the progress-bar
    # class lives at `tqdm.tqdm` (calling the module itself is a TypeError).
    for file in tqdm.tqdm(files_to_update, desc='Updating'):
        data = requests.get(f'https://raw.githubusercontent.com/rly0nheart/YouTube-Comment-Scraper/master/{file}')
        # Never overwrite a local file with the body of an HTTP error page.
        data.raise_for_status()
        # The `with` block closes the file; no explicit close() is needed.
        with open(file, "wb") as f:
            f.write(data.content)

    print("[+] Updated: Re-run program.")
    # Equivalent to the `site`-provided exit(), but always available.
    raise SystemExit


def get_comment_dict(video_url, max_comments=100):
    """
    Groups the comments of one video by their author.

    At most ``max_comments`` comments are read from the downloader.
    Returns a defaultdict whose keys are the comments' 'author' values
    and whose values are lists of the comment objects by that author.
    """
    by_author = defaultdict(list)
    comment_stream = YoutubeCommentDownloader().get_comments_from_url(video_url)
    # islice stops after max_comments items without consuming the rest
    for entry in islice(comment_stream, max_comments):
        by_author[entry['author']].append(entry)
    return by_author

def _extract_video_id(url):
    """
    Returns the video id embedded in a video url.

    The ``v`` query parameter is used when present, wherever it appears in
    the query string. Otherwise the last non-empty path segment is used
    (e.g. ``youtu.be/<id>``), and the url itself is the final fallback so
    the caller always receives a non-empty key.
    """
    # Imported locally so this block does not depend on the file's imports.
    from urllib.parse import parse_qs, urlparse

    parts = urlparse(url)
    ids = parse_qs(parts.query).get('v')
    if ids:
        return ids[0]
    last_segment = parts.path.rstrip('/').rsplit('/', 1)[-1]
    return last_segment or url


def find_multiple_authors(video_urls):
    """
    Prints the authors that commented on more than one of the given videos.

    The comments of every url in ``video_urls`` are fetched with
    ``get_comment_dict`` (using its default comment limit). Then, for every
    pair of videos, the authors present in both are printed together with
    the first 100 characters of each of their comments on either video.
    Nothing is returned.
    """
    # video_dict maps the video id to the
    # comment dict for that video
    video_dict = {}
    for url in video_urls:
        vid_uid = _extract_video_id(url)
        print('[~] Getting comments for video: ', vid_uid)
        video_dict[vid_uid] = get_comment_dict(url)

    # Iterate over the possible combinations of videos
    for (vid_id1, dict1), (vid_id2, dict2) in combinations(video_dict.items(), r=2):
        # Use set intersection to find common authors
        common_authors = dict1.keys() & dict2.keys()
        print(f'Videos: {vid_id1} & {vid_id2} have {len(common_authors)}')
        print(common_authors)
        for author in common_authors:
            print(f'[+] Author: {author}')
            # For each video of the pair, list the comments the author left
            # on it, numbered from 1 and cut to the first 100 chars
            for vid_id, comment_dict in ((vid_id1, dict1), (vid_id2, dict2)):
                print(f'[+] Video {vid_id} comments: ')
                for i, comment in enumerate(comment_dict[author], start=1):
                    print(i, comment['text'][:100])

            print()


if __name__ == '__main__':
    # NOTE: the first positional argument of ArgumentParser is `prog`, so
    # this text is what argparse shows as the program name in usage output.
    parser = argparse.ArgumentParser(
        'YouTube-Comment-Scraper — by Richard Mwewa',
        epilog='scrapes youtube comments and checks whether a user commented on the given videos',
    )
    parser.add_argument('videos', nargs='+', help='list of youtube video urls')
    # Reuse the module-level version so that --version cannot drift from the
    # value the update check compares against the latest release tag.
    parser.add_argument('-v', '--version', version=program_version_number, action='version')
    args = parser.parse_args()
    try:
        check_and_get_updates()
        find_multiple_authors(args.videos)

    except KeyboardInterrupt:
        print('[!] Process interrupted with Ctrl+C.')

    except Exception as e:
        # Top-level boundary: report any failure as a single line instead
        # of a traceback.
        print('[!] An error occurred:', e)