Add files via upload

2026-06-12 21:38:30 +03:00 · 2021-08-11 18:34:46 +02:00
parent db74373c3d
commit 29e87b75c3
2 changed files with 121 additions and 53 deletions
--- a/extract_hashtag.py
+++ b/extract_hashtag.py
@@ -1,71 +1,75 @@
 import os, sys
 import csv, json
 import matplotlib.pyplot as plt
-from collections import Counter, OrderedDict


-def get_hashtag_list(obj):
+
+def get_hashtags(obj):
+    """Map each hashtag name to the set of item indices where it appears.
+
+    ``obj`` is indexed by position; every item is read through its
+    ``'hashtags'`` entry and every hashtag through its ``'name'`` entry.
+
+    Returns a dict of ``name -> set of int indices``. When ``obj`` is falsy
+    an error is printed and ``None`` is returned instead.
+    """
    if not obj:
        print(f'ERROR: Empty item, no hashtags to be extracted.')
        return
    else:
-        hashtag_list = []
-        length = len(obj)
-        for i in range(length):
+        hashtags = {}
+        l = len(obj)
+        for i in range(l):
            for hashtag in obj[i]['hashtags']:
-                hashtag_list.append(hashtag['name'])
-    return hashtag_list
+                # Record index i under this name; a set keeps each index once
+                # even if the same tag is listed twice on one item.
+                if hashtag['name'] in hashtags:
+                    hashtags[hashtag['name']].add(i)
+                else:
+                    hashtags[hashtag['name']] = {i}
+    return hashtags


-def create_csv(file_name, d):
+def create_csv(file_name, path, d):
+    """Write the hashtag table ``d`` to ``path`` as comma-separated text.
+
+    A ``Name, Occurances, Positions`` header line is written first, then one
+    line per entry of ``d``: the key, ``value[0]`` and ``value[1]``. Commas
+    inside the rendered ``value[1]`` are replaced with semicolons, presumably
+    so that field is not split into extra columns.
+
+    ``file_name`` is not used in the body. ``path`` is opened in ``"w"``
+    mode. Prints a confirmation message and returns ``None``.
+    """
+    with open(path, "w") as f:
+        f.write(f"Name, Occurances, Positions" + "\n")
+        for key,value in d.items():
+            f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n")
+    print(f'The sorted hashtag occcurances list is contained in the file {path}.')
+    return None
+
+
+def plot_occurances(file_name, plots):
+    """Write the sorted hashtag CSV for ``file_name`` and plot the leading tags.
+
+    The CSV path is ``./<file_name without extension>_sorted_hashtags.csv``.
+    If that file already exists a message is printed and nothing else is
+    done. Otherwise the JSON in ``file_name`` is loaded, tags are collected
+    with ``get_hashtags``, sorted in descending order, written with
+    ``create_csv``, and the first ``plots`` tags are shown in a matplotlib
+    scatter plot of tag name against occurrence count.
+
+    Returns ``None``.
+    """
    base = os.path.splitext(file_name)[0]
    path = f"./{base}_sorted_hashtags.csv"
    if os.path.exists(path):
-        print(f'The file {path} containing hashtag occurances already exists.')
-        return None
+        print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.')
+        return 
    else:
-        with open(path, "w") as f:
-            f.write(f"Name, Occurances" + "\n")
-            for key,value in d.items():
-                f.write(f"{key}, {value}" + "\n")
-        print(f'The sorted hashtag occcurances list is contained in the file {path}.')
-        return None
+        with open(file_name) as f:
+            obj = json.load(f)
+            l = len(obj)
+            # NOTE(review): get_hashtags returns None when obj is empty; the
+            # .items() call below does not guard against that.
+            tags = get_hashtags(obj)
+            # Pair every tag with its count: name -> (count, index set).
+            tags = {key: (len(value), value) for (key, value) in tags.items()}
+            # NOTE(review): the sort key is the whole (count, set) tuple, so
+            # tags with equal counts are ordered by comparing their index sets.
+            sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
+            create_csv(file_name, path, sorted_tags)
+            k = list(sorted_tags.keys())
+            v = list(sorted_tags.values())
+            # Keep only the counts for the y axis.
+            v = [i[0] for i in v]
+            # Limit both axes to the first `plots` entries.
+            k = k[:plots]
+            v = v[:plots]
+            plt.scatter(k, v)
+            plt.tight_layout()
+            plt.title(f'Hashtag Distribution')
+            plt.xlabel(f'Top {plots} hashtags from {l} posts.')
+            plt.ylabel(f'Number of occurances')
+            plt.show()
+        return


-def plot_hashtag_occurances(file_name, plots):
-    with open(file_name) as f:
-        obj = json.load(f)
-        length = len(obj)
-        hashtag_list = get_hashtag_list(obj)
-        hashtags = Counter(hashtag_list).most_common()
-        hashtags_sorted = {k:v for (k,v) in hashtags}
-        create_csv(file_name, hashtags_sorted)  
-        k = list(hashtags_sorted.keys())
-        v = list(hashtags_sorted.values()) 
-        k = k[:plots]
-        v = v[:plots]
-        plt.scatter(k, v)
-        plt.tight_layout()
-        plt.title(f'Hashtag Distribution')
-        plt.xlabel(f'Top {plots} hashtags from {length} posts.')
-        plt.ylabel(f'Number of occurances')
-        plt.show()
-    return
-
-
-
-if len(sys.argv) != 3:
-    print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
-    sys.exit()
-else:
-    try:
-        int(sys.argv[2])
-    except:
-        print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
-        raise
-    
-    try:
-        plot_hashtag_occurances(sys.argv[1], int(sys.argv[2]))
-    except:
-        print("Unexpected error:", sys.exc_info()[0])
-        raise
+if __name__ == "__main__":
+    # Command-line entry point: python3 extract_hashtag.py file.json n
+    if len(sys.argv) != 3:
+        print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
+        sys.exit()
+
+    try:
+        plots = int(sys.argv[2])
+        # The usage text promises a positive integer, so reject zero and
+        # negative values here instead of letting them reach the slicing code.
+        if plots <= 0:
+            raise ValueError(f'expected a positive integer, got {plots}')
+    except ValueError:
+        print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
+        raise
+
+    try:
+        plot_occurances(sys.argv[1], plots)
+    except Exception:
+        # Report the exception type, then let the traceback propagate.
+        print("Unexpected error:", sys.exc_info()[0])
+        raise
--- a/extract_posts.py
+++ b/extract_posts.py
@@ -0,0 +1,64 @@
+import os, sys
+import csv, json
+import re
+from pandas import *
+
+def arg_check():
+    """Exit the program unless ``sys.argv`` holds exactly three entries.
+
+    When the count differs, a usage error is printed and ``sys.exit()`` is
+    called; otherwise the function simply returns ``None``.
+    """
+    expected_argc = 3
+    if len(sys.argv) == expected_argc:
+        return
+    print('ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag')
+    sys.exit()
+
+def get_hashtag_positions(file_name, hashtag):
+    """Return the post indices recorded for ``hashtag`` in the hashtag CSV.
+
+    The CSV path is ``./<file_name without extension>_sorted_hashtags.csv``.
+    If it does not exist yet, ``extract_hashtag.py`` is run on ``file_name``
+    (with ``1`` as its second argument) to generate it. The lookup itself is
+    delegated to ``tag_membership``, whose result is returned unchanged.
+    """
+    # Local import: only needed on the path that has to generate the CSV.
+    import subprocess
+
+    base = os.path.splitext(file_name)[0]
+    path = f"./{base}_sorted_hashtags.csv"
+    if not os.path.exists(path):
+        print(f'Generating {path} ...')
+        # Pass the arguments as a list (no shell) so a file name containing
+        # spaces or shell metacharacters reaches the script as one argument.
+        subprocess.run(['python3', 'extract_hashtag.py', file_name, '1'])
+
+    return tag_membership(hashtag, path)
+
+
+def tag_membership(hashtag, path):
+    """Look up ``hashtag`` in the CSV at ``path`` and return its post indices.
+
+    The tag is matched against the ``Name`` column; the indices are parsed
+    from the third column of the first matching row, where they are expected
+    to be rendered like ``{0; 3; 7}``.
+
+    Returns a list of ints, or ``None`` when no row matches.
+    """
+    # Read every cell as text and switch off NA detection: with the defaults
+    # a tag literally named "null", "nan", "NA", ... is parsed as a missing
+    # value and can never match, and numeric-looking names may be converted.
+    data = read_csv(path, dtype=str, keep_default_na=False)
+    matches = data[data["Name"] == hashtag]
+    if matches.empty:
+        return None
+    # Pull the integers straight out of the cell instead of depending on the
+    # exact brace/semicolon layout.
+    cell = matches.iloc[0, 2]
+    return [int(digits) for digits in re.findall(r'\d+', cell)]
+
+
+def print_posts(file_name, path, hashtag, positions):
+    """Write the posts at ``positions`` from a JSON file to a CSV at ``path``.
+
+    ``file_name`` is loaded with ``json.load`` and indexed with every entry
+    of ``positions``; the selected posts are written with ``csv.DictWriter``.
+    Columns are the union of the selected posts' keys in first-seen order,
+    so a post carrying a key the first post lacks no longer aborts the
+    export; keys a post does not have are left empty in its row.
+
+    ``hashtag`` is not used in the body. Prints a confirmation message and
+    returns ``None``.
+
+    Raises ``IndexError`` when ``positions`` is empty or holds an index that
+    is out of range for the loaded data.
+    """
+    with open(file_name) as f:
+        data = json.load(f)
+    posts = [data[p] for p in positions]
+    if not posts:
+        # Fail before creating an empty output file, which would otherwise
+        # block later runs that refuse to overwrite it.
+        raise IndexError('no positions given, nothing to write')
+
+    # A dict is used as an ordered set of column names.
+    fieldnames = {}
+    for post in posts:
+        fieldnames.update(dict.fromkeys(post))
+
+    with open(path, 'w', newline='') as csv_file:
+        writer = csv.DictWriter(csv_file, list(fieldnames))
+        writer.writeheader()
+        writer.writerows(posts)
+    print(f'The posts are contained in the file {path}.')
+    return
+
+
+if __name__ == "__main__":
+    # Script entry point: python3 extract_posts.py file.json hashtag
+    arg_check()
+    file_name, hashtag = sys.argv[1], sys.argv[2]
+    path = f"./{hashtag}_posts.csv"
+
+    # Never overwrite an export that already exists for this hashtag.
+    if os.path.exists(path):
+        print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.')
+        sys.exit()
+
+    positions = get_hashtag_positions(file_name, hashtag)
+    if not positions:
+        print(f'{hashtag} not found!!!!')
+        sys.exit()
+    print_posts(file_name, path, hashtag, positions)