Files
tiktok-hashtag-analysis/analytics/hashtag_frequencies.py
2022-02-25 16:55:02 +01:00

133 lines
4.3 KiB
Python

import os, sys
import csv, json
import argparse
import matplotlib.pyplot as plt
from datetime import datetime
"""
Plots the frequency of hashtags appearing in the set of given posts.
"""
sys.path.insert(0, '../tiktok_downloader')
import file_methods, global_data
def get_hashtags(obj):
    """
    Maps every hashtag name to the set of indices of the posts that use it.

    obj : indexable collection of posts; each post is read through its
          'hashtags' entry, whose items are read through their 'name' entry

    Returns a dict {hashtag name: {post index, ...}}. For empty input an
    error line is printed and None is returned.
    """
    if not obj:
        print('ERROR: Empty item, no hashtags to be extracted.')
        return

    posts_by_tag = {}
    for post_index in range(len(obj)):
        for tag in obj[post_index]['hashtags']:
            # A set per hashtag: a post that repeats a hashtag counts once.
            posts_by_tag.setdefault(tag['name'], set()).add(post_index)
    return posts_by_tag
def get_occurrences(filename, n=1, sort=True):
    """
    Loads the posts stored in a json file and counts in how many posts each
    hashtag appears.

    filename : path of the json file containing the posts
    n        : number of top hashtags to return (only used when sort is True)
    sort     : when True, rank the hashtags by number of posts, highest first

    With sort=True returns the triplet:
    l       : total posts in the file
    k       : list of top n hashtags
    v_total : number of posts containing each of the top n hashtags

    With sort=False returns the triplet:
    obj : the loaded posts themselves (not their count)
    k   : every hashtag, in order of first appearance
    v   : for each hashtag, the pair (number of posts, set of post indices)

    A file without posts yields empty hashtag lists instead of raising.
    """
    # The file is only needed for parsing; close it before the counting starts.
    with open(filename) as f:
        obj = json.load(f)
    l = len(obj)

    # get_hashtags returns None (after printing an error) for empty input;
    # fall back to an empty mapping so the caller still gets a valid triplet.
    tags = get_hashtags(obj) or {}
    tags = {key: (len(value), value) for (key, value) in tags.items()}

    if not sort:
        return obj, list(tags), list(tags.values())

    # Rank on the post count alone. sorted() is stable, so hashtags with the
    # same count keep their first-appearance order.
    ranked = sorted(tags.items(), key=lambda item: item[1][0], reverse=True)[:n]
    k = [name for name, _ in ranked]
    v_total = [count for _, (count, _) in ranked]
    return l, k, v_total
def plot(n, length, k, v, img_folder):
    """
    Draws a scatter plot of the top hashtags, saves it as a png and shows it.

    n          : number of top hashtags (used in the x-axis label)
    length     : total number of posts (used in the x-axis label)
    k          : hashtag names, plotted along the x-axis
    v          : number of occurrences of each hashtag in k
    img_folder : folder handed to save_plot for the png file
    """
    plt.scatter(k, v)
    plt.xticks(rotation=45)
    plt.title('Hashtag Distribution')
    plt.xlabel(f'Top {n} hashtags from {length} posts.')
    plt.ylabel('Number of occurrences')
    # tight_layout only accounts for what is already on the figure, so it has
    # to run after the title, axis labels and rotated ticks are in place.
    plt.tight_layout()
    save_plot(plt, img_folder)
    plt.show(block=None)
def print_occurrences(l, k, v):
    """
    Prints a table of hashtags to the shell: one header line, then one row per
    hashtag with its rank, name, number of occurrences and the ratio of
    occurrences to total posts.

    l : total number of posts (the divisor of the ratio)
    k : hashtag names
    v : number of occurrences of each hashtag in k

    Ranks are numbered from 0 in the order the hashtags are given.
    """
    row_template = "{:<8} {:<15} {:<15} {:<15}"
    header = row_template.format(
        "Rank",
        'Hashtag',
        'Occurrences',
        f'Frequency (Occurrences/Total-Posts({l}))',
    )
    print(header)
    for rank, (hashtag, count) in enumerate(zip(k, v)):
        print(row_template.format(rank, hashtag, count, count / l))
def save_plot(plt, img_folder):
    """
    Saves the plot as "<img_folder>/<timestamp>.png".

    plt        : object whose savefig method receives the target path
    img_folder : folder the png file is written to

    The timestamp is the current local time formatted as
    YYYY_MM_DD_HH_MM_SS. Any error raised by savefig propagates to the caller.
    """
    timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    target = f"{img_folder}/{timestamp}.png"
    plt.savefig(target)
if __name__ == "__main__":
    # Command line entry point.
    #   input_file : json file containing the posts
    #   n          : how many of the most frequent hashtags to report
    #   -p/--plot  : plot the hashtag frequencies and save the plot as a png
    #                file in the folder global_data.IMAGES
    #   -d/--print : print the hashtag frequencies on the shell
    # When neither -p nor -d is given the frequencies are printed.
    # get_occurrences computes the top n hashtags and their occurrences.
    img_folder = global_data.IMAGES
    # NOTE(review): presumably makes sure the image folder exists — confirm
    # against file_methods.check_file.
    file_methods.check_file(img_folder, "dir")

    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The json hashtag file name")
    parser.add_argument("n", help="The number of top n occurrences", type=int)
    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
    args = parser.parse_args()

    # Validate n on its own: 0 is falsy, so a combined truthiness check would
    # hide it behind the generic error message.
    if args.n < 1:
        print("Please make sure the number of top occurrences is a positive integer.")
        sys.exit(1)
    if not args.input_file:
        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
        sys.exit(1)

    length, keys, values = get_occurrences(args.input_file, args.n)
    # Printing stays the default action and also honours an explicit -d; it
    # runs first so the table is on the shell before any plot window opens.
    if args.print or not args.plot:
        print_occurrences(length, keys, values)
    if args.plot:
        plot(args.n, length, keys, values, img_folder)