add comments

2026-06-08 03:18:31 +03:00 · 2022-02-25 16:55:02 +01:00
parent d3edf604a9
commit b958ee52fe
5 changed files with 180 additions and 27 deletions
--- a/analytics/hashtag_frequencies.py
+++ b/analytics/hashtag_frequencies.py
@@ -4,6 +4,11 @@ import argparse
 import matplotlib.pyplot as plt
 from datetime import datetime

+"""
+Plots the frequency of hashtags appearing in the set of given posts.
+"""
+
+
 sys.path.insert(0, '../tiktok_downloader')
 import file_methods, global_data

@@ -26,6 +31,12 @@ def get_hashtags(obj):


 def get_occurrences(filename, n=1 , sort=True):
+    """
+    Takes the json file containing posts and returns the triplet:
+    l : total posts in the file
+    k : list of top n hashtags
+    v_total : frequency of top n hashtags in l
+    """
    with open(filename) as f:
        obj = json.load(f)
        l = len(obj)
@@ -34,7 +45,7 @@ def get_occurrences(filename, n=1 , sort=True):
        if not sort:
            k = list(tags.keys())
            v = list(tags.values())
-            return obj, k, v 
+            return obj, k, v
        else:
            sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
            k = list(sorted_tags.keys())
@@ -59,12 +70,15 @@ def plot(n, length, k, v, img_folder):


 def print_occurrences(l, k, v):
+    """
+    Prints the top n hashtags with their occurrences and the ratio of occurrences to total posts, all to the shell.
+    """
    row_number = 0
    total_posts = l
    print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))'))
    #print(f'Hashtag                  Occurrences                 Frequency(Occurances/Total-Posts)')
    for key,value in zip(k, v):
-        ratio = value/total_posts 
+        ratio = value/total_posts
        print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio))
        #print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}')
        row_number += 1
@@ -72,6 +86,9 @@ def print_occurrences(l, k, v):


 def save_plot(plt, img_folder):
+    """
+    Saves the plot to a png file in the folder /data/imgs/
+    """
    try:
        now = datetime.now()
        current_time = now.strftime("%Y_%m_%d_%H_%M_%S")
@@ -83,6 +100,13 @@ def save_plot(plt, img_folder):


 if __name__ == "__main__":
+    """
+    Option "n" specifies how many hashtags the user wants to plot.
+    "-d" option prints the hashtag frequencies on the shell
+    "-p" option plots the hashtag frequencies and saves as a png file in the folder /data/imgs/
+
+    The function get_occurrences is triggered to compute and return the top n hashtags and their occurrences.
+    """
    img_folder = global_data.IMAGES
    file_methods.check_file(img_folder, "dir")
    parser = argparse.ArgumentParser()
--- a/tiktok_downloader/data_methods.py
+++ b/tiktok_downloader/data_methods.py
@@ -4,12 +4,20 @@ from datetime import datetime
 import global_data
 import file_methods

+"""
+The file contains several functions that perform data processing related tasks.
+"""
+

 Difference = namedtuple("Difference", "new_ids size")
 Total = namedtuple("Total", "total unique")


 def get_difference(tag, file, ids):
+    """
+    Compares two sets of ids and returns the difference of the two sets.
+    Purpose - used to filter out the new ids by comparing the stored id list (ids/post_ids.json or videos_ids.json) with the list of newly downloaded ids.
+    """
    maiden_entry = False
    current_id_data = file_methods.get_data(file)
    if tag in current_id_data:
@@ -30,6 +38,9 @@ def get_difference(tag, file, ids):


 def extract_posts(settings, file_name, tag):
+    """
+    Takes the file downloaded by the tiktok-scraper that contains the posts, and returns the new posts after comparing it with the list of posts (from the file ids/post_ids.json) already downloaded.
+    """
    ids = []
    posts = []
    new_posts = []
@@ -40,7 +51,7 @@ def extract_posts(settings, file_name, tag):
    if not ids:
        print(f"WARNING: no posts were found for {tag} in the file - {file_name}")
        return
-   
+
    status = file_methods.check_existence(settings["post_ids"], "file")
    if not status:
        new_data = (ids, posts)
@@ -64,6 +75,9 @@ def extract_posts(settings, file_name, tag):


 def extract_videos(settings, tag, download_list):
+    """
+    Tiktok-scraper downloads the videos and puts them in a folder - the list of ids of the downloaded videos is fed to this function as download_list. The function returns the set of new videos after comparing it with the list of videos (from the file ids/videos_ids.json) already downloaded.
+    """
    status = file_methods.check_existence(settings["video_ids"], "file")
    if not status:
        new_data = download_list
@@ -82,6 +96,9 @@ def extract_videos(settings, tag, download_list):


 def update_posts(file_path, file_type, new_data, tag=None):
+    """
+    Updates the list of post ids (in the file ids/post_ids.json) with the ids of the new posts.
+    """
    try:
        status = file_methods.check_existence(file_path, file_type)
        if not tag:
@@ -94,6 +111,9 @@ def update_posts(file_path, file_type, new_data, tag=None):


 def update_videos(settings, new_data, tag):
+    """
+    Updates the list of video ids (in the file ids/video_ids.json) with the ids of the new videos.
+    """
    file_path = settings["video_ids"]
    file_methods.check_file(file_path, "file")
    log = file_methods.id_writer(file_path, new_data, tag, True)
@@ -102,6 +122,9 @@ def update_videos(settings, new_data, tag):


 def get_total_posts(file_path, tag):
+    """
+    Returns the total count of ids in an id list along with the number of unique ids among them.
+    """
    status = file_methods.check_existence(file_path, "file")
    if not status:
        raise OSError("{file_path} not found!")
@@ -114,6 +137,9 @@ def get_total_posts(file_path, tag):


 def print_total(file_path, tag, data_type):
+    """
+    Prints the total count for posts or videos for a hashtag. Calls the function get_total_posts for sanity check that there are no repeating ids in the id lists.
+    """
    total = get_total_posts(file_path, tag)
    if (total.total == total.unique):
        print(f"Total {data_type} for the hashtag {tag} are: {total.total}")
@@ -121,5 +147,3 @@ def print_total(file_path, tag, data_type):
    else:
        print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...")
        return
-
-
--- a/tiktok_downloader/file_methods.py
+++ b/tiktok_downloader/file_methods.py
@@ -4,7 +4,15 @@ import global_data
 import data_methods


+"""
+The file contains the functions that operate on files, such as writing or reading from files etc.
+"""
+
+
 def create_file(name, file_type):
+    """
+    Creates a file or directory.
+    """
    if (file_type == "dir"):
        os.makedirs(name, mode=0o777)
    elif (file_type == "file"):
@@ -15,6 +23,9 @@ def create_file(name, file_type):


 def check_existence(file_path, file_type):
+    """
+    Checks the existence of a file or a directory. If not found, returns False, else returns True.
+    """
    if (file_type == "file"):
        if os.path.isfile(file_path):
            return True
@@ -30,24 +41,32 @@ def check_existence(file_path, file_type):


 def check_file(file_path, file_type):
+    """
+    Creates the file or directory if it does not already exist; otherwise does nothing. Returns nothing in either case.
+    """
    status = check_existence(file_path, file_type)
    if not status:
-        create_file(file_path, file_type)    
+        create_file(file_path, file_type)

    return


 def download_posts(settings, tag):
+    """
+    Runs the tiktok-scraper command to download posts for a given hashtag.
+    Returns the path to the downloaded file of posts. If no file was downloaded, prints the error and returns nothing in order to move on.
+    os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of run_downloader script.
+    """
    path = os.path.join(settings["data"], tag, settings["posts"])
    os.chdir(path)
    try:
-        tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'" 
+        tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'"
        result = subprocess.run([tiktok_command], capture_output=True, shell=True)
        if result.stdout:
            new_file = result.stdout.decode('utf-8').split()[-1]
            if ("json" in new_file):
                os.chdir("../../../tiktok_downloader")
-                return new_file 
+                return new_file
            else:
                print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!")
                os.chdir("../../../tiktok_downloader")
@@ -61,11 +80,16 @@ def download_posts(settings, tag):


 def download_videos(settings, tag):
+    """
+    Runs the tiktok-scraper command to download videos for a given hashtag. Note that all the videos returned by the tiktok api are downloaded and, as a result, it's a time- and data-consuming process.
+    The list of downloaded video ids is constructed and returned if the downloaded folder contains at least 1 video.
+    os.chdir is used to execute shell commands in the right folders and then reused to come back to the original folder of execution of run_downloader script.
+    """
    path = os.path.join(settings["data"], tag, settings["videos"])
    os.chdir(path)
    try:
-        # tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" 
-        tiktok_command = f"tiktok-scraper hashtag {tag} -d" 
+        # tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d"
+        tiktok_command = f"tiktok-scraper hashtag {tag} -d"
        result = subprocess.run([tiktok_command], capture_output=True, shell=True)
        if result.stdout:
            downloaded_list_tmp = os.listdir(f"./#{tag}")
@@ -74,7 +98,7 @@ def download_videos(settings, tag):
                for file in downloaded_list_tmp:
                    file = file[0:-4]
                    downloaded_list.append(file)
-                
+
                os.chdir("../../../tiktok_downloader")
                return downloaded_list
            else:
@@ -85,22 +109,33 @@ def download_videos(settings, tag):
            os.chdir("../../../tiktok_downloader")
            print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!")
            return
-        
+
    except: raise


 def get_data(file_path):
+    """
+    Reads the json file and returns the read data.
+    """
    with open(file_path, "r") as f:
        data = json.load(f)
        return data


 def dump_data(file_path, data):
+    """
+    Writes the data to the json file.
+    """
    with open(file_path, "w") as f:
        json.dump(data, f)
-        return            
+        return

 def log_writer(log_data):
+    """
+    Creates the dictionary of total downloads (posts and videos) per hashtag.
+    Example : {timestamp : {hashtag : { videos : number_of_new_videos , posts : number_of_new_posts } } }
+    Writes the dictionary to the log file (logs/log.json).
+    """
    total = 0
    try:
        log_dict = {}
@@ -132,6 +167,9 @@ def log_writer(log_data):


 def id_writer(file_path, new_data, tag, status):
+    """
+    Writes the list of new ids to the post_ids or video_ids files.
+    """
    try:
        total = len(new_data)
        if status:
@@ -140,7 +178,7 @@ def id_writer(file_path, new_data, tag, status):
                if tag in data:
                    data[tag] += new_data
                else:
-                    data[tag]= new_data 
+                    data[tag]= new_data
                dump_data(file_path, data)
            except json.decoder.JSONDecodeError:
                data = { tag : new_data }
@@ -155,6 +193,9 @@ def id_writer(file_path, new_data, tag, status):


 def post_writer(file_path, new_data, status):
+    """
+    Writes the new posts in the post file of the given hashtag (/data/{hashtag}/posts/data.json)
+    """
    try:
        total = len(new_data)
        if status:
@@ -174,6 +215,9 @@ def post_writer(file_path, new_data, status):


 def delete_file(file_path, file_type):
+    """
+    Deletes the directory or the file.
+    """
    if not check_existence(file_path, file_type):
        print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!")
    elif (file_type == "file"):
@@ -190,12 +234,16 @@ def delete_file(file_path, file_type):


 def clean_video_files(settings, tag, new_data=None):
+    """
+    Moves the new videos from the tiktok-scraper video folder to /data/{hashtag}/videos/
+    Deletes the residual tiktok-scraper video folder.
+    """
    try:
        if new_data:
            for file in new_data:
                settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4"
                subprocess.call(f"mv {settings['videos_from']} {settings['videos_to']}", shell=True)
-             
+
        subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True)
        print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.")
    except:
--- a/tiktok_downloader/global_data.py
+++ b/tiktok_downloader/global_data.py
@@ -1,3 +1,8 @@
+"""
+Contains global constants relating to paths and operational parameters such as sleep time between consecutive tiktok-scraper calls.
+"""
+
+
 # Directories
 DATA = "../data"
 IDS = "ids"
@@ -37,4 +42,3 @@ PARAMETERS = {
 #            "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
            "sleep" : 8
        }
-
--- a/tiktok_downloader/run_downloader.py
+++ b/tiktok_downloader/run_downloader.py
@@ -8,6 +8,39 @@ import file_methods
 import data_methods


+"""
+The run_downloader.py downloads data using the tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
+1. "-p" option is used by the user to download posts only
+2. "-v" option is used to download videos only
+3. "-p -v" is used to download posts and videos
+4. "--h" is used to specify a list of hashtags as arguments
+5. "-f" option is used to read the list of hashtags from the user specified file
+
+Example:
+    1. The command "python3 run_downloader.py --h london paris newyork -p" will download posts for hashtags london, paris and newyork.
+    2. The command "python3 run_downloader.py -f hashtag_list -p -v" will download posts and videos for hashtags in the file hashtag_list.
+
+
+The downloaded data is stored in the data folder. The data folder is organized as follows:
+    1. the log subfolder contains the log.json that records total downloads (posts and videos) for each hashtag with a timestamp of when the script was run.
+    2. the ids subfolder contains post_ids.json and video_ids.json that keep the record of post and video ids that are currently in the data set. This helps to filter out only new posts every time tiktok-scraper is run and only those new posts (or videos) are then stored in the data folder.
+    3. Each hashtag has a subfolder by its name containing two subfolders, one each for posts and videos.
+
+
+This script runs the function get_data in main which in turn triggers the following sequence:
+    1. get_posts function is triggered if the user wants to download posts
+    2. get_videos function is triggered if the user wants to download videos
+    3. both functions above are sequentially triggered if the user wants to download both posts and videos.
+    4. After the data is downloaded the log_writer is triggered to log the total number of posts and videos downloaded.
+
+
+------------Files--------------
+global_data - contains global constants relating to paths etc.
+data_methods - this file contains data processing methods
+file_methods - this file contains methods to write and update data in files
+hashtag_list - this file contains the list of hashtags that the user wants to download data for.
+"""
+

 command = "python3 post_downloader.py "

@@ -37,6 +70,10 @@ def create_parser():


 def set_download_settings(download_data_type):
+    """
+    Loads the constants from global_data into the dict called settings and returns it.
+    Purpose - easy access to global constants by various functions.
+    """
    settings = {}
    settings["data"] = global_data.FILES["data"]
    settings["ids"] = global_data.FILES["ids"]
@@ -54,7 +91,6 @@ def set_download_settings(download_data_type):
    elif download_data_type == "videos":
        settings["videos"] = global_data.FILES["videos"]
        settings["video_ids"] = global_data.FILES["video_ids"]
-        settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"]
        return settings
    elif download_data_type == "posts-videos":
        settings["posts"] = global_data.FILES["posts"]
@@ -62,7 +98,6 @@ def set_download_settings(download_data_type):
        settings["data_file"] = global_data.FILES["data_file"]
        settings["videos"] = global_data.FILES["videos"]
        settings["video_ids"] = global_data.FILES["video_ids"]
-        settings["number_of_videos"] = global_data.PARAMETERS["number_of_videos"]
        return settings
    else:
        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
@@ -71,6 +106,11 @@ def set_download_settings(download_data_type):


 def get_posts(settings, tag):
+    """
+    1. calls download_posts in file_methods.py to get the posts for a given hashtag
+    2. calls extract_posts from data_methods.py to extract new posts if any
+    3. calls update_posts from data_methods.py to update the id-list with the ids of newly downloaded posts.
+    """
    file_path = file_methods.download_posts(settings, tag)
    log = ()
    if file_path:
@@ -80,12 +120,18 @@ def get_posts(settings, tag):
            data_methods.update_posts(data_file, "file", new_data[1])
            log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag)
        file_methods.delete_file(file_path, "file")
-    
+
    return log



-def get_videos(settings, tag):    
+def get_videos(settings, tag):
+    """
+    1. calls download_videos in file_methods.py to get the videos for a given hashtag
+    2. calls extract_videos from data_methods.py to extract new videos if any
+    3. calls update_videos from data_methods.py to update the id-list with the ids of newly downloaded videos.
+    4. the clean_video_files function deletes the residual video folder after the data processing
+    """
    log = ()
    download_list = file_methods.download_videos(settings, tag)
    if download_list:
@@ -100,11 +146,15 @@ def get_videos(settings, tag):


 def get_data(hashtags, download_data_type):
+    """
+    The function checks for the user option "-p", "-v" or both and then
+    triggers the functions get_posts, get_videos or both, respectively.
+    """
    counter = 0
    total_hashtags = len(hashtags)
    total_hashtags_offset = total_hashtags - 1
    log_data = []
-    
+
    if download_data_type == "posts":
        settings = set_download_settings(download_data_type)
        while counter < total_hashtags:
@@ -116,7 +166,7 @@ def get_data(hashtags, download_data_type):
                log = ( res[0], ( "posts", res[1] ) )
                log_data.append(log)
                data_methods.print_total(settings["post_ids"], tag, download_data_type)
-            
+
            counter += 1
            if counter < total_hashtags_offset:
                time.sleep(settings["sleep"])
@@ -132,7 +182,7 @@ def get_data(hashtags, download_data_type):
                res = ( res[0], ( "videos", res[1]))
                log_data.append(res)
                data_methods.print_total(settings["video_ids"], tag, download_data_type)
- 
+
            counter += 1
            if counter < total_hashtags_offset:
                time.sleep(settings["sleep"])
@@ -154,7 +204,7 @@ def get_data(hashtags, download_data_type):
                    res = ( res[0], (req[0], res[1]) )
                    log_data.append(res)
                    data_methods.print_total(settings[req[1]], tag, req[0])
-                
+
                if req_counter < total_reqs_offset:
                    time.sleep(settings["sleep"])
                    req_counter += 1
@@ -169,6 +219,9 @@ def get_data(hashtags, download_data_type):


 def get_hashtags(file_name, hashtag_list):
+    """
+    Loads and returns the list of hashtags from user specified file.
+    """
    try:
        from hashtag_list import hashtag_list
        return hashtag_list
@@ -184,7 +237,7 @@ if __name__ == "__main__":
    if not (args.h or args.f):
        parser.error("No hashtags were given, please use either --h option or -f to provide hashtags.")
        sys.exit()
-    
+
    if not (args.p or args.v):
        parser.error("No argument given, please specify either -p for posts or -v videos or both.")
        sys.exit()
@@ -206,8 +259,8 @@ if __name__ == "__main__":
        download_data_type = "posts"
    else:
        download_data_type = "videos"
-   
-    try: 
+
+    try:
        log_data = get_data(hashtags, download_data_type)
        if log_data:
            file_methods.log_writer(log_data)