From 618f53e2618aa09417e0d7db48cf02b2378f673f Mon Sep 17 00:00:00 2001 From: johannawild <72805812+johannawild@users.noreply.github.com> Date: Mon, 2 May 2022 23:55:01 +0200 Subject: [PATCH] Update hashtag_frequencies.py --- analytics/hashtag_frequencies.py | 75 ++++++++++++++------------------ 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py index 0bd99ff..2ded3d1 100644 --- a/analytics/hashtag_frequencies.py +++ b/analytics/hashtag_frequencies.py @@ -20,72 +20,63 @@ def get_hashtags(obj): return else: hashtags = {} - l = len(obj) - for i in range(l): - for hashtag in obj[i]['hashtags']: - if hashtag['name'] in hashtags: - hashtags[hashtag['name']].add(i) - else: - hashtags[hashtag['name']] = {i} - return hashtags + tags = [ [tag['name'] for tag in ele['hashtags']] for ele in obj ] + tags = [ set(ele) for ele in tags ] + { tag: (1 if tag not in hashtags and not hashtags.update({tag: 1}) + else hashtags[tag] + 1 and not hashtags.update({tag: hashtags[tag] + 1})) + for ele in tags for tag in ele } + hashtags = sorted(hashtags.items(), key=lambda e: e[1], reverse=True) + + return hashtags def get_occurrences(filename, n=1 , sort=True): """ - Takes the json file containing posts and returns the triplet: - l : total posts in the file - k : list of top n hashtags - v_total : frequency of top n hashtags in l + Takes the json file containing posts and returns a dictionary: + local variable occs = { + "total": total posts in the file, + top_n: [[top n hashtags ], [frequencies of corresponding hashtags]] + } """ with open(filename) as f: obj = json.load(f) l = len(obj) tags = get_hashtags(obj) - tags = {key: (len(value), value) for (key, value) in tags.items()} - if not sort: - k = list(tags.keys()) - v = list(tags.values()) - return obj, k, v - else: - sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)} - k = list(sorted_tags.keys()) - v = list(sorted_tags.values()) - k = k[:n] - v_total = [i[0] for i in v] - v_total = v_total[:n] - return l, k, v_total + occs = { + "total": l, + "top_n": [] + } + occs["top_n"] = [ [ ele[i] for ele in tags[0:n] ] for i in range(2)] + return occs - -def plot(n, length, k, v, img_folder): - plt.scatter(k, v) +def plot(n, occs, img_folder): + plt.scatter(occs["top_n"][0], occs["top_n"][1]) plt.tight_layout() plt.xticks(rotation=45) plt.title(f'Hashtag Distribution') - plt.xlabel(f'Top {n} hashtags from {length} posts.') + plt.xlabel(f'Top {n} hashtags from {occs["total"]} posts.') plt.ylabel(f'Number of occurrences') - save_plot(plt, img_folder) + save_plot(img_folder) plt.show(block=None) return -def print_occurrences(l, k, v): +def print_occurrences(occs): """ Prints the top n hashtags with their frequencies and the ratio of occurrences and total posts, all to the shell. """ row_number = 0 - total_posts = l - print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))')) - #print(f'Hashtag Occurrences Frequency(Occurances/Total-Posts)') - for key,value in zip(k, v): - ratio = value/total_posts + total_posts = occs["total"] + print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts(total_posts))')) + for key,value in zip(occs["top_n"][0], occs["top_n"][1]): + ratio = value/total_posts print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio)) - #print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}') row_number += 1 return -def save_plot(plt, img_folder): +def save_plot(img_folder): """ Saves the plot to a png file in the folder /data/imgs/ """ @@ -123,10 +114,10 @@ if __name__ == "__main__": base = os.path.splitext(args.input_file)[0] path = f"./{base}_sorted_hashtags.csv" if args.plot: - length, keys, values = get_occurrences(args.input_file, args.n) - plot(args.n, length, keys, values, img_folder) + occs = get_occurrences(args.input_file, args.n) + plot(args.n, occs, img_folder) else: - length, keys, values = get_occurrences(args.input_file, args.n) - print_occurrences(length, keys, values) + occs = get_occurrences(args.input_file, args.n) + print_occurrences(occs) else: print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')