Files
tiktok-hashtag-analysis/extract_hashtag.py
2021-08-10 19:06:23 +02:00

72 lines
2.2 KiB
Python

import os, sys
import csv, json
import matplotlib.pyplot as plt
from collections import Counter, OrderedDict
def get_hashtag_list(obj):
if not obj:
print(f'ERROR: Empty item, no hashtags to be extracted.')
return
else:
hashtag_list = []
length = len(obj)
for i in range(length):
for hashtag in obj[i]['hashtags']:
hashtag_list.append(hashtag['name'])
return hashtag_list
def create_csv(file_name, d):
base = os.path.splitext(file_name)[0]
path = f"./{base}_sorted_hashtags.csv"
if os.path.exists(path):
print(f'The file {path} containing hashtag occurances already exists.')
return None
else:
with open(path, "w") as f:
f.write(f"Name, Occurances" + "\n")
for key,value in d.items():
f.write(f"{key}, {value}" + "\n")
print(f'The sorted hashtag occcurances list is contained in the file {path}.')
return None
def plot_hashtag_occurances(file_name, plots):
with open(file_name) as f:
obj = json.load(f)
length = len(obj)
hashtag_list = get_hashtag_list(obj)
hashtags = Counter(hashtag_list).most_common()
hashtags_sorted = {k:v for (k,v) in hashtags}
create_csv(file_name, hashtags_sorted)
k = list(hashtags_sorted.keys())
v = list(hashtags_sorted.values())
k = k[:plots]
v = v[:plots]
plt.scatter(k, v)
plt.tight_layout()
plt.title(f'Hashtag Distribution')
plt.xlabel(f'Top {plots} hashtags from {length} posts.')
plt.ylabel(f'Number of occurances')
plt.show()
return
if len(sys.argv) != 3:
print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
sys.exit()
else:
try:
int(sys.argv[2])
except:
print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
raise
try:
plot_hashtag_occurances(sys.argv[1], int(sys.argv[2]))
except:
print("Unexpected error:", sys.exc_info()[0])
raise