diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py index 79350db..0bd99ff 100644 --- a/analytics/hashtag_frequencies.py +++ b/analytics/hashtag_frequencies.py @@ -4,6 +4,11 @@ import argparse import matplotlib.pyplot as plt from datetime import datetime +""" +Plots the frequency of hashtags appearing in the set of given posts. +""" + + sys.path.insert(0, '../tiktok_downloader') import file_methods, global_data @@ -26,6 +31,12 @@ def get_hashtags(obj): def get_occurrences(filename, n=1 , sort=True): + """ + Takes the json file containing posts and returns the triplet: + l : total posts in the file + k : list of top n hashtags + v_total : frequency of top n hashtags in l + """ with open(filename) as f: obj = json.load(f) l = len(obj) @@ -34,7 +45,7 @@ def get_occurrences(filename, n=1 , sort=True): if not sort: k = list(tags.keys()) v = list(tags.values()) - return obj, k, v + return obj, k, v else: sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} k = list(sorted_tags.keys()) @@ -59,12 +70,15 @@ def plot(n, length, k, v, img_folder): def print_occurrences(l, k, v): + """ + Prints the top n hashtags with their frequencies and the ratio of occurrences and total posts, all to the shell. 
+ """ row_number = 0 total_posts = l print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))')) #print(f'Hashtag Occurrences Frequency(Occurances/Total-Posts)') for key,value in zip(k, v): - ratio = value/total_posts + ratio = value/total_posts print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio)) #print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}') row_number += 1 @@ -72,6 +86,9 @@ def print_occurrences(l, k, v): def save_plot(plt, img_folder): + """ + Saves the plot to a png file in the folder /data/imgs/ + """ try: now = datetime.now() current_time = now.strftime("%Y_%m_%d_%H_%M_%S") @@ -83,6 +100,13 @@ def save_plot(plt, img_folder): if __name__ == "__main__": + """ + Option "n" specifies how many hashtags the user wants to plot. + "-d" option prints the hashtag frequencies on the shell + "-p" option plots the hashtag frequencies and saves as a png file in the folder /data/imgs/ + + The function get_occurrences is triggered to compute and return the top n occurrences and the hashtags. + """ img_folder = global_data.IMAGES file_methods.check_file(img_folder, "dir") parser = argparse.ArgumentParser() diff --git a/tiktok_downloader/data_methods.py b/tiktok_downloader/data_methods.py index e99c79e..86b8633 100644 --- a/tiktok_downloader/data_methods.py +++ b/tiktok_downloader/data_methods.py @@ -4,12 +4,20 @@ from datetime import datetime import global_data import file_methods +""" +The file contains several functions that perform data processing related tasks. +""" + Difference = namedtuple("Difference", "new_ids size") Total = namedtuple("Total", "total unique") def get_difference(tag, file, ids): + """ + Compares two sets of ids and returns the difference of the two sets. + Purpose - used to filter out the new ids by comparing the set of id list (ids/post_ids.json or video_ids.json) and the list of newly downloaded ids. 
+ """ maiden_entry = False current_id_data = file_methods.get_data(file) if tag in current_id_data: @@ -30,6 +38,9 @@ def get_difference(tag, file, ids): def extract_posts(settings, file_name, tag): + """ + Takes the file downloaded by the tiktok-scraper that contains the posts, and returns the new posts after comparing it with the list of posts (from the file ids/post_ids.json) already downloaded. + """ ids = [] posts = [] new_posts = [] @@ -40,7 +51,7 @@ def extract_posts(settings, file_name, tag): if not ids: print(f"WARNING: no posts were found for {tag} in the file - {file_name}") return - + status = file_methods.check_existence(settings["post_ids"], "file") if not status: new_data = (ids, posts) @@ -64,6 +75,9 @@ def extract_posts(settings, file_name, tag): def extract_videos(settings, tag, download_list): + """ + Tiktok-scraper downloads the videos and puts them in a folder - the list of ids of the downloaded videos is fed to this function as download_list. The function returns the set of new videos after comparing it with the list of videos (from the file ids/video_ids.json) already downloaded. + """ status = file_methods.check_existence(settings["video_ids"], "file") if not status: new_data = download_list @@ -82,6 +96,9 @@ def extract_videos(settings, tag, download_list): def update_posts(file_path, file_type, new_data, tag=None): + """ + Updates the list of post ids (in the file ids/post_ids.json) with the ids of the new posts. + """ try: status = file_methods.check_existence(file_path, file_type) if not tag: @@ -94,6 +111,9 @@ def update_posts(file_path, file_type, new_data, tag=None): def update_videos(settings, new_data, tag): + """ + Updates the list of video ids (in the file ids/video_ids.json) with the ids of the new videos. 
+ """ file_path = settings["video_ids"] file_methods.check_file(file_path, "file") log = file_methods.id_writer(file_path, new_data, tag, True) @@ -102,6 +122,9 @@ def update_videos(settings, new_data, tag): def get_total_posts(file_path, tag): + """ + Returns the total count of ids in an id list along with the number of unique ids among them. + """ status = file_methods.check_existence(file_path, "file") if not status: raise OSError("{file_path} not found!") @@ -114,6 +137,9 @@ def get_total_posts(file_path, tag): def print_total(file_path, tag, data_type): + """ + Prints the total count for posts or videos for a hashtag. Calls the function get_total_posts as a sanity check that there are no repeating ids in the id lists. + """ total = get_total_posts(file_path, tag) if (total.total == total.unique): print(f"Total {data_type} for the hashtag {tag} are: {total.total}") @@ -121,5 +147,3 @@ def print_total(file_path, tag, data_type): else: print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...") return - - diff --git a/tiktok_downloader/file_methods.py b/tiktok_downloader/file_methods.py index 915fb31..c0a1309 100644 --- a/tiktok_downloader/file_methods.py +++ b/tiktok_downloader/file_methods.py @@ -4,7 +4,15 @@ import global_data import data_methods +""" +The file contains the functions that operate on files, such as writing or reading from files etc. +""" + + def create_file(name, file_type): + """ + Creates a file or directory. + """ if (file_type == "dir"): os.makedirs(name, mode=0o777) elif (file_type == "file"): @@ -15,6 +23,9 @@ def create_file(name, file_type): def check_existence(file_path, file_type): + """ + Checks the existence of a file or a directory. If not found, returns False, else returns True. 
+ """ if (file_type == "file"): if os.path.isfile(file_path): return True @@ -30,24 +41,32 @@ def check_existence(file_path, file_type): def check_file(file_path, file_type): + """ + Creates a file or directory, if not found. Else, returns nothing. + """ status = check_existence(file_path, file_type) if not status: - create_file(file_path, file_type) + create_file(file_path, file_type) return def download_posts(settings, tag): + """ + Runs the tiktok-scraper command to download posts for a given hashtag. + Returns the path to the downloaded file of posts. If no file was downloaded, prints the error and returns nothing in order to move on. + os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of the run_downloader script. + """ path = os.path.join(settings["data"], tag, settings["posts"]) os.chdir(path) try: - tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'" + tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'" result = subprocess.run([tiktok_command], capture_output=True, shell=True) if result.stdout: new_file = result.stdout.decode('utf-8').split()[-1] if ("json" in new_file): os.chdir("../../../tiktok_downloader") - return new_file + return new_file else: print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!") os.chdir("../../../tiktok_downloader") @@ -61,11 +80,16 @@ def download_posts(settings, tag): def download_videos(settings, tag): + """ + Runs the tiktok-scraper command to download videos for a given hashtag. Note that all the videos returned by the tiktok api are downloaded and as a result, it's a time and data consuming process. + The list of downloaded video ids is constructed and returned if the downloaded folder contains at least 1 video. 
+ os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of the run_downloader script. + """ path = os.path.join(settings["data"], tag, settings["videos"]) os.chdir(path) try: - # tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" - tiktok_command = f"tiktok-scraper hashtag {tag} -d" + # tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" + tiktok_command = f"tiktok-scraper hashtag {tag} -d" result = subprocess.run([tiktok_command], capture_output=True, shell=True) if result.stdout: downloaded_list_tmp = os.listdir(f"./#{tag}") @@ -74,7 +98,7 @@ def download_videos(settings, tag): for file in downloaded_list_tmp: file = file[0:-4] downloaded_list.append(file) - + os.chdir("../../../tiktok_downloader") return downloaded_list else: @@ -85,22 +109,33 @@ def download_videos(settings, tag): os.chdir("../../../tiktok_downloader") print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!") return - + except: raise def get_data(file_path): + """ + Reads the json file and returns the read data. + """ with open(file_path, "r") as f: data = json.load(f) return data def dump_data(file_path, data): + """ + Writes the data to the json file. + """ with open(file_path, "w") as f: json.dump(data, f) - return + return def log_writer(log_data): + """ + Creates the dictionary of total downloads (posts and videos) per hashtag. + Example : {timestamp : {hashtag : { videos : number_of_new_videos , posts : number_of_new_posts } } } + Writes the dictionary to the log file (logs/log.json). + """ total = 0 try: log_dict = {} @@ -132,6 +167,9 @@ def log_writer(log_data): def id_writer(file_path, new_data, tag, status): + """ + Writes the list of new ids to the post_ids or video_ids files. 
+ """ try: total = len(new_data) if status: @@ -140,7 +178,7 @@ def id_writer(file_path, new_data, tag, status): if tag in data: data[tag] += new_data else: - data[tag]= new_data + data[tag]= new_data dump_data(file_path, data) except json.decoder.JSONDecodeError: data = { tag : new_data } @@ -155,6 +193,9 @@ def id_writer(file_path, new_data, tag, status): def post_writer(file_path, new_data, status): + """ + Writes the new posts in the post file of the given hashtag (/data/{hashtag}/posts/data.json) + """ try: total = len(new_data) if status: @@ -174,6 +215,9 @@ def post_writer(file_path, new_data, status): def delete_file(file_path, file_type): + """ + Deletes the directory or the file. + """ if not check_existence(file_path, file_type): print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!") elif (file_type == "file"): @@ -190,12 +234,16 @@ def delete_file(file_path, file_type): def clean_video_files(settings, tag, new_data=None): + """ + Moves the new videos from the tiktok-scraper video folder to /data/{hashtag}/videos/ + Deletes the residual tiktok-scraper video folder. + """ try: if new_data: for file in new_data: settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4" subprocess.call(f"mv {settings['videos_from']} {settings['videos_to']}", shell=True) - + subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True) print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.") except: diff --git a/tiktok_downloader/global_data.py b/tiktok_downloader/global_data.py index e3d2221..57df4f1 100644 --- a/tiktok_downloader/global_data.py +++ b/tiktok_downloader/global_data.py @@ -1,3 +1,8 @@ +""" +Contains global constants relating to paths and operational parameters such as sleep time between consecutive tiktok-scraper calls. +""" + + # Directories DATA = "../data" IDS = "ids" @@ -37,4 +42,3 @@ PARAMETERS = { # "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper. 
"sleep" : 8 } - diff --git a/tiktok_downloader/run_downloader.py b/tiktok_downloader/run_downloader.py index 494aa9f..d2e53bf 100644 --- a/tiktok_downloader/run_downloader.py +++ b/tiktok_downloader/run_downloader.py @@ -8,6 +8,39 @@ import file_methods import data_methods +""" +The run_downloader.py downloads data using the tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper). +1. "-p" option is used by the user to download posts only +2. "-v" option is used to download videos only +3. "-p -v" is used to download posts and videos +4. "--h" is used to specify a list of hashtags as arguments +5. "-f" option is used to read the list of hashtags from the user specified file + +Example: + 1. The command "python3 run_downloader.py --h london paris newyork -p" will download posts for hashtags london, paris and newyork. + 2. The command "python3 run_downloader.py -f hashtag_list -p -v" will download posts and videos for hashtags in the file hashtag_list. + + +The downloaded data is stored in the data folder. The data folder is organized as follows: + 1. the log subfolder contains the log.json that records total downloads (posts and videos) for each hashtag with a timestamp of when the script was run. + 2. the ids subfolder contains post_ids.json and video_ids.json that keep the record of post and video ids that are currently in the data set. This helps to filter out only new posts every time tiktok-scraper is run and only those new posts (or videos) are then stored in the data folder. + 3. Each hashtag has a subfolder by its name containing two subfolders, one each for posts and videos. + + +This script runs the function get_data in main which in turn triggers the following sequence: + 1. get_posts function is triggered if the user wants to download posts + 2. get_videos function is triggered if the user wants to download videos + 3. both functions above are sequentially triggered if the user wants to download both posts and videos. + 4. 
After the data is downloaded the log_writer is triggered to log the total number of posts and videos downloaded. + + +------------Files-------------- +global_data - contains global constants relating to paths etc. +data_methods - this file contains data processing methods +file_methods - this file contains methods to write and update data in files +hashtag_list - this file contains the list of hashtags that the user wants to download data for. +""" + command = "python3 post_downloader.py " @@ -37,6 +70,10 @@ def create_parser(): def set_download_settings(download_data_type): + """ + Loads the constants from global_data into the dict called settings and returns it. + Purpose - easy access to global constants by various functions. + """ settings = {} settings["data"] = global_data.FILES["data"] settings["ids"] = global_data.FILES["ids"] @@ -54,7 +91,6 @@ def set_download_settings(download_data_type): elif download_data_type == "videos": settings["videos"] = global_data.FILES["videos"] settings["video_ids"] = global_data.FILES["video_ids"] - settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"] return settings elif download_data_type == "posts-videos": settings["posts"] = global_data.FILES["posts"] @@ -62,7 +98,6 @@ def set_download_settings(download_data_type): settings["data_file"] = global_data.FILES["data_file"] settings["videos"] = global_data.FILES["videos"] settings["video_ids"] = global_data.FILES["video_ids"] - settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"] return settings else: print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.") @@ -71,6 +106,11 @@ def set_download_settings(download_data_type): def get_posts(settings, tag): + """ + 1. calls download_posts in file_methods.py to get the posts for a given hashtag + 2. calls extract_posts from data_methods.py to extract new posts if any + 3. 
calls update_posts from data_methods.py to update the id-list with the ids of newly downloaded posts. + """ file_path = file_methods.download_posts(settings, tag) log = () if file_path: @@ -80,12 +120,18 @@ def get_posts(settings, tag): data_methods.update_posts(data_file, "file", new_data[1]) log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag) file_methods.delete_file(file_path, "file") - + return log -def get_videos(settings, tag): +def get_videos(settings, tag): + """ + 1. calls download_videos in file_methods.py to get the videos for a given hashtag + 2. calls extract_videos from data_methods.py to extract new videos if any + 3. calls update_videos from data_methods.py to update the id-list with the ids of newly downloaded videos. + 4. the clean_video_files function deletes the residual video folder after the data processing + """ log = () download_list = file_methods.download_videos(settings, tag) if download_list: @@ -100,11 +146,15 @@ def get_videos(settings, tag): def get_data(hashtags, download_data_type): + """ + The function checks for the user option "-p", "-v" or both and then + triggers the functions get_posts, get_videos or both, respectively. 
+ """ counter = 0 total_hashtags = len(hashtags) total_hashtags_offset = total_hashtags - 1 log_data = [] - + if download_data_type == "posts": settings = set_download_settings(download_data_type) while counter < total_hashtags: @@ -116,7 +166,7 @@ def get_data(hashtags, download_data_type): log = ( res[0], ( "posts", res[1] ) ) log_data.append(log) data_methods.print_total(settings["post_ids"], tag, download_data_type) - + counter += 1 if counter < total_hashtags_offset: time.sleep(settings["sleep"]) @@ -132,7 +182,7 @@ def get_data(hashtags, download_data_type): res = ( res[0], ( "videos", res[1])) log_data.append(res) data_methods.print_total(settings["video_ids"], tag, download_data_type) - + counter += 1 if counter < total_hashtags_offset: time.sleep(settings["sleep"]) @@ -154,7 +204,7 @@ def get_data(hashtags, download_data_type): res = ( res[0], (req[0], res[1]) ) log_data.append(res) data_methods.print_total(settings[req[1]], tag, req[0]) - + if req_counter < total_reqs_offset: time.sleep(settings["sleep"]) req_counter += 1 @@ -169,6 +219,9 @@ def get_data(hashtags, download_data_type): def get_hashtags(file_name, hashtag_list): + """ + Loads and returns the list of hashtags from user specified file. + """ try: from hashtag_list import hashtag_list return hashtag_list @@ -184,7 +237,7 @@ if __name__ == "__main__": if not (args.h or args.f): parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.") sys.exit() - + if not (args.p or args.v): parser.error("No argument given, please specify either -p for posts or -v videos or both.") sys.exit() @@ -206,8 +259,8 @@ if __name__ == "__main__": download_data_type = "posts" else: download_data_type = "videos" - - try: + + try: log_data = get_data(hashtags, download_data_type) if log_data: file_methods.log_writer(log_data)