Files
tiktok-hashtag-analysis/analytics/hashtag_frequencies.py
2022-05-02 23:55:01 +02:00

124 lines
4.0 KiB
Python

import argparse
import csv
import json
import os
import sys
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
"""
Plots the frequency of hashtags appearing in the set of given posts.
"""
# Put the sibling "tiktok_downloader" directory first on the import path so
# the two modules below can be imported. The path is relative, so Python
# resolves it against the current working directory, not this file's location.
# NOTE(review): presumably the script is meant to be launched from its own
# folder -- confirm, or anchor the path on __file__.
sys.path.insert(0, '../tiktok_downloader')
import file_methods, global_data
def get_hashtags(obj):
    """
    Counts in how many posts each hashtag appears.

    A hashtag that is repeated inside a single post is counted only once for
    that post.

    Args:
        obj: List of post dicts. Every post must carry a "hashtags" list whose
            items are dicts with a "name" key.

    Returns:
        A list of (hashtag, count) tuples ordered by descending count. Ties
        are ordered alphabetically so the ranking (and therefore any top-n
        cut-off) is reproducible between runs. An empty list is returned,
        after printing an error, when `obj` is empty or None.

    Raises:
        KeyError: If a post has no "hashtags" key or a hashtag has no "name".
    """
    if not obj:
        print('ERROR: Empty item, no hashtags to be extracted.')
        # Return an empty ranking rather than None so callers can slice it.
        return []

    counts = Counter()
    for post in obj:
        # The set collapses duplicates: each post adds at most 1 per hashtag.
        counts.update({tag['name'] for tag in post['hashtags']})

    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
def get_occurrences(filename, n=1 , sort=True):
    """
    Loads a JSON file of posts and collects its `n` most frequent hashtags.

    Args:
        filename: Path of a JSON file containing the posts.
        n: How many of the top-ranked hashtags to keep.
        sort: Unused; kept so existing callers keep working.

    Returns:
        A dictionary of the form:
        {
            "total": total posts in the file,
            "top_n": [[top n hashtags], [frequencies of corresponding hashtags]]
        }
        Both inner lists are empty when no hashtags were found.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        posts = json.load(f)

    # get_hashtags may return None for an empty file; treat that as "no tags"
    # instead of failing on the slice.
    ranked = (get_hashtags(posts) or [])[:n]

    return {
        "total": len(posts),
        "top_n": [
            [name for name, _ in ranked],
            [count for _, count in ranked],
        ],
    }
def plot(n, occs, img_folder):
    """
    Draws a scatter plot of the top hashtags, saves it and then displays it.

    Args:
        n: Number of hashtags requested; only used in the x-axis label.
        occs: Dictionary with the keys "total" (number of posts) and "top_n"
            ([[hashtags], [frequencies]]), as built by get_occurrences.
        img_folder: Folder handed to save_plot for the png file.
    """
    plt.scatter(occs["top_n"][0], occs["top_n"][1])
    plt.xticks(rotation=45)
    plt.title('Hashtag Distribution')
    plt.xlabel(f'Top {n} hashtags from {occs["total"]} posts.')
    plt.ylabel('Number of occurrences')
    # Fit the layout only once the rotated ticks, title and labels exist;
    # calling it earlier cannot account for them and the saved image may
    # end up with clipped text.
    plt.tight_layout()
    save_plot(img_folder)
    plt.show(block=None)
def print_occurrences(occs):
    """
    Prints the top n hashtags with their frequencies and the ratio of occurrences and total posts, all to the shell.

    Args:
        occs: Dictionary with the keys "total" (number of posts) and "top_n"
            ([[hashtags], [frequencies]]).

    The first printed line is a header that includes the total number of
    posts; every following line holds rank (starting at 0), hashtag,
    occurrences and occurrences divided by total posts.
    """
    total_posts = occs["total"]
    row_format = "{:<8} {:<15} {:<15} {:<15}"

    # The braces are required: without them the header showed the literal
    # text "total_posts" instead of the number.
    print(row_format.format(
        "Rank",
        "Hashtag",
        "Occurrences",
        f"Frequency (Occurrences/Total-Posts({total_posts}))",
    ))

    hashtags, counts = occs["top_n"][0], occs["top_n"][1]
    for rank, (hashtag, count) in enumerate(zip(hashtags, counts)):
        print(row_format.format(rank, hashtag, count, count / total_posts))
def save_plot(img_folder):
    """
    Saves the current plot to a png file named after the current local time.

    Args:
        img_folder: Folder that receives the file "<YYYY_MM_DD_HH_MM_SS>.png".

    Any error raised while writing the file propagates to the caller.
    """
    current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    plt.savefig(os.path.join(img_folder, f"{current_time}.png"))
if __name__ == "__main__":
    # Command line entry point.
    #
    # Positional "input_file" is the json file holding the posts.
    # Positional "n" specifies how many hashtags the user wants to process.
    # "-d" option prints the hashtag frequencies on the shell.
    # "-p" option plots the hashtag frequencies and saves them as a png file
    #      in the images folder.
    # With neither option the frequencies are printed.
    #
    # get_occurrences is called to compute the top n hashtags and their
    # occurrences.
    img_folder = global_data.IMAGES
    file_methods.check_file(img_folder, "dir")

    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The json hashtag file name")
    parser.add_argument("n", help="The number of top n occurrences", type=int)
    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
    args = parser.parse_args()

    if not args.input_file:
        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
        sys.exit(1)

    # Check the range explicitly: 0 is falsy, so a plain truthiness test on
    # args.n would send it to the generic error above instead of this one.
    if args.n < 1:
        print("Please make sure the number of top occurrences is a positive integer.")
        sys.exit(1)

    occs = get_occurrences(args.input_file, args.n)

    # Printing stays the default when no output option is given.
    if args.print or not args.plot:
        print_occurrences(occs)
    if args.plot:
        plot(args.n, occs, img_folder)