# tiktok-hashtag-analysis/analytics/hashtag_frequencies.py

import os, sys
import csv, json
import argparse
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt

"""
Plots the frequency of hashtags appearing in the set of given posts.
"""


sys.path.insert(0, '../tiktok_downloader')
import file_methods, global_data


def get_hashtags(obj):
    """
    Counts, for every hashtag, the number of posts in which it appears.

    Every element of `obj` must provide a "hashtags" list whose entries carry
    a "name" key. A hashtag repeated inside a single post is counted only once
    for that post.

    Returns a list of (hashtag, count) tuples ordered by descending count;
    hashtags with equal counts keep the order in which they were first seen,
    so the result is reproducible between runs. When `obj` is empty an error
    is printed and None is returned.
    """
    if not obj:
        print('ERROR: Empty item, no hashtags to be extracted.')
        return None

    counts = Counter()
    for post in obj:
        # dict.fromkeys drops duplicates inside the post while keeping the
        # order of the tags (a set would make the order of ties arbitrary)
        for name in dict.fromkeys(tag['name'] for tag in post['hashtags']):
            counts[name] += 1

    # most_common() sorts by descending count and is stable for equal counts
    return counts.most_common()


def get_occurrences(filename, n=1 , sort=True):
    """
    Loads the posts stored in a json file and summarises their hashtags.

    Parameters:
        filename: path of the json file containing the posts.
        n: how many of the most frequent hashtags to keep.
        sort: not used by this function; kept so existing calls keep working.

    Returns a dictionary:
    occs = {
        "total": total posts in the file,
        "top_n": [[top n hashtags], [frequencies of corresponding hashtags]]
    }
    Both inner lists are empty when the file contains no posts.
    """
    # Read with an explicit encoding so hashtags containing non-ASCII
    # characters do not depend on the platform default
    with open(filename, encoding="utf-8") as f:
        obj = json.load(f)

    # get_hashtags returns None when there are no posts; treat that as
    # "no hashtags" instead of failing on the slice below
    tags = get_hashtags(obj) or []
    top = tags[0:n]
    return {
        "total": len(obj) if obj else 0,
        "top_n": [[name for name, _ in top], [count for _, count in top]],
    }


def plot(n, occs, img_folder):
    """
    Draws a scatter plot of the top hashtags against their number of
    occurrences, saves it through save_plot and then displays it.

    Parameters:
        n: number of top hashtags requested, used in the x-axis label.
        occs: dictionary with the keys "total" and "top_n", where "top_n" is
              [[hashtags], [occurrences of the corresponding hashtags]].
        img_folder: folder handed to save_plot for the png file.
    """
    plt.scatter(occs["top_n"][0], occs["top_n"][1])
    plt.xticks(rotation=45)
    plt.title('Hashtag Distribution')
    plt.xlabel(f'Top {n} hashtags from {occs["total"]} posts.')
    plt.ylabel('Number of occurrences')
    # Adjust the layout only once the title, axis labels and rotated ticks
    # exist, otherwise they are not taken into account and may be cut off
    plt.tight_layout()
    save_plot(img_folder)
    plt.show(block=None)
    return


def print_occurrences(occs):
    """
    Prints a table of the top n hashtags to the shell.

    `occs` is a dictionary with the keys "total" (number of posts) and "top_n"
    ([[hashtags], [occurrences of the corresponding hashtags]]). Each row shows
    the rank (counted from 0), the hashtag, its occurrences and the ratio of
    occurrences to total posts. The header shows the total number of posts.
    """
    total_posts = occs["total"]
    row_format = "{:<8} {:<15} {:<15} {:<15}"
    # The total is interpolated into the header so the reader can see which
    # denominator the frequency column is based on
    print(row_format.format("Rank", 'Hashtag', 'Occurrences',
                            f'Frequency (Occurrences/Total-Posts({total_posts}))'))
    pairs = zip(occs["top_n"][0], occs["top_n"][1])
    for rank, (hashtag, count) in enumerate(pairs):
        print(row_format.format(rank, hashtag, count, count / total_posts))
    return


def save_plot(img_folder):
    """
    Saves the current plot as a png file inside `img_folder`.

    The file is named after the current local time (year_month_day_hour_
    minute_second). Any error raised while saving propagates to the caller.
    """
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    # os.path.join copes with a folder given with or without a trailing separator
    plt.savefig(os.path.join(img_folder, f"{current_time}.png"))
    return


if __name__ == "__main__":
    """
    Option "n" specifies how many hashtags does the user wants to plot.
    "-d" option prints the hashtag frequencies on the shell
    "-p" option plots the hashtag frequencies and saves as a png file in the folder /data/imgs/

    The function get_occurances is triggered to compute and return the top n occurances and the hashtags.
    """
    img_folder = global_data.IMAGES
    file_methods.check_file(img_folder, "dir")
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The json hashtag file name")
    parser.add_argument("n", help="The number of top n occurrences", type=int)
    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
    args = parser.parse_args()
    if args.input_file and args.n:
        if args.n < 1:
            print(f"Please make sure the number of top occurrences is a positive integer.")
            sys.exit()

        base = os.path.splitext(args.input_file)[0]
        path = f"./{base}_sorted_hashtags.csv"
        if args.plot:
            occs = get_occurrences(args.input_file, args.n)
            plot(args.n, occs, img_folder)
        else:
            occs = get_occurrences(args.input_file, args.n)
            print_occurrences(occs)
    else:
        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')