mirror of
https://github.com/bellingcat/tiktok-hashtag-analysis.git
synced 2026-06-11 12:58:30 +03:00
124 lines
4.0 KiB
Python
124 lines
4.0 KiB
Python
import os, sys
|
|
import csv, json
|
|
import argparse
|
|
import matplotlib.pyplot as plt
|
|
from datetime import datetime
|
|
|
|
"""
|
|
Plots the frequency of hashtags appearing in the set of given posts.
|
|
"""
|
|
|
|
|
|
sys.path.insert(0, '../tiktok_downloader')
|
|
import file_methods, global_data
|
|
|
|
|
|
|
|
def get_hashtags(obj):
    """
    Count how many posts each hashtag appears in.

    `obj` is an iterable of posts; every post must provide a "hashtags"
    entry holding a list of dicts that each have a "name" key. A hashtag
    repeated inside a single post is counted only once for that post.

    Returns a list of (hashtag, post_count) tuples sorted by post_count in
    descending order. If `obj` is empty, an error is printed and an empty
    list is returned so that callers can slice the result safely.
    """
    # Imported locally so the function does not rely on a file-level import.
    from collections import Counter

    if not obj:
        print('ERROR: Empty item, no hashtags to be extracted.')
        return []

    counts = Counter()
    for post in obj:
        # The set removes duplicates within one post, so each post adds at
        # most 1 to a given hashtag's count.
        counts.update({tag['name'] for tag in post['hashtags']})

    # most_common() with no argument returns every entry, highest count first.
    return counts.most_common()
|
|
|
|
|
|
def get_occurrences(filename, n=1, sort=True):
    """
    Load posts from a JSON file and summarise the n most frequent hashtags.

    Parameters:
        filename: path of the JSON file containing the posts.
        n: how many of the top hashtags to include.
        sort: unused; kept so existing callers keep working.

    Returns a dictionary:
    {
        "total": number of top-level items (posts) in the file,
        "top_n": [[top n hashtags], [frequencies of corresponding hashtags]]
    }
    For a file with no posts, "top_n" is [[], []].
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which can fail on non-ASCII hashtags.
    with open(filename, encoding="utf-8") as f:
        posts = json.load(f)

    # get_hashtags may return nothing for empty input; treat that as
    # "no hashtags" instead of failing on the slice.
    ranked = (get_hashtags(posts) or [])[0:n]

    return {
        "total": len(posts),
        "top_n": [
            [name for name, _ in ranked],
            [count for _, count in ranked],
        ],
    }
|
|
|
|
|
|
def plot(n, occs, img_folder):
    """
    Scatter-plot the top hashtags against their occurrence counts.

    Parameters:
        n: number of top hashtags requested; used only in the x-axis label.
        occs: dictionary with "total" and "top_n" entries, where "top_n" is
            [[hashtags], [occurrence counts]].
        img_folder: folder handed to save_plot for the PNG output.

    The figure is saved through save_plot and then shown.
    """
    hashtags = occs["top_n"][0]
    counts = occs["top_n"][1]

    plt.scatter(hashtags, counts)
    plt.xticks(rotation=45)
    plt.title('Hashtag Distribution')
    plt.xlabel(f'Top {n} hashtags from {occs["total"]} posts.')
    plt.ylabel('Number of occurrences')
    # tight_layout only fits what exists when it is called, so it must run
    # after the rotated ticks, title and axis labels have been set.
    plt.tight_layout()

    save_plot(img_folder)
    plt.show(block=None)
|
|
|
|
|
|
def print_occurrences(occs):
    """
    Print the top hashtags as a table on standard output.

    `occs` is a dictionary with a "total" entry (the number of posts) and a
    "top_n" entry of the form [[hashtags], [occurrence counts]].

    Each row shows a zero-based rank, the hashtag, its occurrence count and
    the ratio of occurrences to total posts. The header includes the total
    number of posts.
    """
    total_posts = occs["total"]
    row_format = "{:<8} {:<15} {:<15} {:<15}"

    # The total is interpolated here; previously the literal text
    # "total_posts" was printed because the braces were missing.
    print(row_format.format(
        "Rank",
        'Hashtag',
        'Occurrences',
        f'Frequency (Occurrences/Total-Posts({total_posts}))',
    ))

    pairs = zip(occs["top_n"][0], occs["top_n"][1])
    for rank, (hashtag, count) in enumerate(pairs):
        print(row_format.format(rank, hashtag, count, count / total_posts))
|
|
|
|
|
|
def save_plot(img_folder):
    """
    Save the current plot as a PNG file inside `img_folder`.

    The file name is the local time at the moment of saving, formatted as
    YYYY_MM_DD_HH_MM_SS. Any error raised while saving propagates to the
    caller unchanged.
    """
    timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    target = f"{img_folder}/{timestamp}.png"
    plt.savefig(target)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point.
    #
    # Positional "input_file" is the JSON file of posts and positional "n"
    # is how many of the top hashtags to report.
    # "-d" prints the hashtag frequencies on the shell.
    # "-p" plots the hashtag frequencies and saves the plot as a png file in
    # the images folder taken from global_data.IMAGES.
    # With neither flag the frequencies are printed; with both flags the
    # script plots and prints.
    #
    # get_occurrences computes the top n hashtags and their occurrences.
    img_folder = global_data.IMAGES
    file_methods.check_file(img_folder, "dir")

    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The json hashtag file name")
    parser.add_argument("n", help="The number of top n occurrences", type=int)
    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
    args = parser.parse_args()

    # Check n explicitly: 0 is falsy and previously fell through to the
    # generic error message instead of this one.
    if args.n < 1:
        print("Please make sure the number of top occurrences is a positive integer.")
        sys.exit(1)

    if not args.input_file:
        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
        sys.exit(1)

    occs = get_occurrences(args.input_file, args.n)
    if args.plot:
        plot(args.n, occs, img_folder)
    # Printing stays the default action and is also honoured next to -p.
    if args.print or not args.plot:
        print_occurrences(occs)
|