From 5c5833421e22e1c03b96b0cb99269b2c29d75f6a Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 20 Jul 2021 22:18:15 +0200
Subject: [PATCH 01/23] Add files via upload

---
 data_processor.sh | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 extract_date.py   | 56 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 data_processor.sh
 create mode 100644 extract_date.py

diff --git a/data_processor.sh b/data_processor.sh
new file mode 100644
index 0000000..e0b4f66
--- /dev/null
+++ b/data_processor.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+#
+# data_processor.sh - re-join records that were split across several lines
+# and sort the repaired lines into clean records and anomalies.
+#
+# Usage: ./data_processor.sh input_file
+#
+# Pass 1 collects every line that does not end in a double quote and glues the
+# collected fragments onto the next line that does end in one.
+# Pass 2 appends repaired lines that start with a double quote to
+# anomalies_<input_file> and all other lines to clean-data_<input_file>.
+#
+# The output names are built by prefixing the argument, so pass a file name
+# from the current directory. Both output files are appended to; remove them
+# before processing the same input again.
+
+if [[ $# -ne 1 || ! -r $1 ]]; then
+    echo "Usage: $0 input_file (the file must exist and be readable)" >&2
+    exit 1
+fi
+
+input=$1
+anomalies="anomalies_${input}"
+clean="clean-data_${input}"
+tmp=tmp.csv
+
+# Print all arguments after the first, separated by the first argument.
+function join_lines {
+    local IFS="$1"
+    shift
+    printf '%s\n' "$*"
+}
+
+# Start from an empty scratch file so leftovers of an aborted run are not
+# mixed into this one.
+: > "$tmp"
+
+to_combine=()
+
+# Pass 1: rebuild broken records. The "|| [ -n ... ]" part keeps a final line
+# that has no trailing newline.
+while IFS= read -r line || [ -n "$line" ]; do
+    if [[ -z $line ]]; then
+        continue
+    elif [[ ${line: -1} != '"' ]]; then
+        # Fragment of a broken record: hold it until a closing line shows up.
+        to_combine+=("$line")
+    elif [[ ${line:0:1} != '"' ]]; then
+        # Closing line, or a record that was never broken: write it out
+        # together with any fragments collected so far.
+        to_combine+=("$line")
+        join_lines " " "${to_combine[@]}" >> "$tmp"
+        to_combine=()
+    else
+        # Starts and ends with a quote: pass it through; pass 2 files it as an
+        # anomaly.
+        printf '%s\n' "$line" >> "$tmp"
+    fi
+done < "$input"
+
+if (( ${#to_combine[@]} > 0 )); then
+    echo "Warning: ${#to_combine[@]} trailing line(s) never reached a closing quote and were dropped." >&2
+fi
+
+# Pass 2: a repaired line that starts with a double quote is an anomaly.
+while IFS= read -r line || [ -n "$line" ]; do
+    if [[ ${line:0:1} == '"' ]]; then
+        printf '%s\n' "$line" >> "$anomalies"
+    else
+        printf '%s\n' "$line" >> "$clean"
+    fi
+done < "$tmp"
+
+rm -f "$tmp"
+
+# Report. -s is true only for a file that exists and is not empty.
+if [[ -s $anomalies ]]; then
+    anmls=$(wc -l < "$anomalies" | awk '{print $1}')
+    echo "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in ${anomalies}."
+else
+    input_data=$(wc -l < "$input" | awk '{print $1}')
+    clean_lines=0
+    if [[ -f $clean ]]; then
+        clean_lines=$(wc -l < "$clean" | awk '{print $1}')
+    fi
+    echo "${clean_lines} lines of clean data out of ${input_data} is recorded in ${clean}."
+fi
diff --git a/extract_date.py b/extract_date.py
new file mode 100644
index 0000000..788cba7
--- /dev/null
+++ b/extract_date.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Plot how many posts in a scraped hashtag JSON file were created per date.
+
+Usage: python3 extract_date.py hashtag_data.json hashtag
+"""
+
+import sys
+import json
+import datetime
+import collections
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+
+
+def list_to_frequency(li):
+    """Return a Counter of the items in ``li``.
+
+    Prints an error and returns None when ``li`` is empty or not a list.
+    """
+    if li and isinstance(li, list):
+        return collections.Counter(li)
+    print(f"ERROR: either {li} is empty or not a list.")
+    return None
+
+
+def eligibility_check(obj):
+    """Return True when ``obj`` is a non-zero int; otherwise print why not."""
+    if not obj:
+        print(f'ERROR: {obj} is empty!')
+        return False
+    # bool is a subclass of int, so an exact type test keeps True out.
+    if type(obj) is not int:
+        print(f'ERROR: {obj} is not an integer as is expected!')
+        return False
+    return True
+
+
+def collect_dates(posts):
+    """Return the local creation date of each post with a usable createTime."""
+    date_list = []
+    for post in posts:
+        # .get() lets a post without the key be reported instead of raising.
+        created = post.get("createTime")
+        if eligibility_check(created):
+            date_list.append(datetime.datetime.fromtimestamp(created).date())
+        else:
+            print(f'ERROR: Some error occured. Check {created}.')
+    return date_list
+
+
+def main():
+    """Load the JSON file named in argv[1] and show posts-per-date for argv[2]."""
+    if len(sys.argv) < 3:
+        print('ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag')
+        sys.exit(1)
+
+    # The file is only needed for parsing, so close it before plotting.
+    with open(sys.argv[1], encoding="utf-8") as file:
+        posts = json.load(file)
+
+    frequency_by_date = list_to_frequency(collect_dates(posts))
+    if frequency_by_date is None:
+        # No usable dates; list_to_frequency has already printed the reason.
+        sys.exit(1)
+
+    dates = list(frequency_by_date.keys())
+    frequency = list(frequency_by_date.values())
+    plt.scatter(dates, frequency)
+    plt.gcf().autofmt_xdate()
+    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
+    plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}')
+    plt.xlabel(f'Dates ({len(dates)} dates out of {len(posts)} posts)')
+    plt.ylabel('Posts')
+    # Lay out last so the title and axis labels are taken into account.
+    plt.tight_layout()
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()

From 287fab2e67e4bb2e91610f4d3fcad9c8da01c829 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 20 Jul 2021 22:26:33 +0200
Subject: [PATCH 02/23] Create README.md

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..301f450
--- /dev/null
+++ b/README.md
@@ -0,0 +1,6 @@
+# TikTok_plotter 
+1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
+2. Use the following command: python3 extract_date.py target_file.json hashtag
+
+
+The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.

From 5646067778eed3c3c519142fa6969c979569e68f Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 10 Aug 2021 19:06:23 +0200
Subject: [PATCH 03/23] Add files via upload

---
 extract_hashtag.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 extract_hashtag.py

diff --git a/extract_hashtag.py b/extract_hashtag.py
new file mode 100644
index 0000000..380c174
--- /dev/null
+++ b/extract_hashtag.py
@@ -0,0 +1,71 @@
+import os, sys
+import csv, json
+import matplotlib.pyplot as plt
+from collections import Counter, OrderedDict
+
+
+def get_hashtag_list(obj):
+    if not obj:
+        print(f'ERROR: Empty item, no hashtags to be extracted.')
+        return
+    else:
+        hashtag_list = []
+        length = len(obj)
+        for i in range(length):
+            for hashtag in obj[i]['hashtags']:
+                hashtag_list.append(hashtag['name'])
+    return hashtag_list
+
+
+def create_csv(file_name, d):
+    base = os.path.splitext(file_name)[0]
+    path = f"./{base}_sorted_hashtags.csv"
+    if os.path.exists(path):
+        print(f'The file {path} containing hashtag occurances already exists.')
+        return None
+    else:
+        with open(path, "w") as f:
+            f.write(f"Name, Occurances" + "\n")
+            for key,value in d.items():
+                f.write(f"{key}, {value}" + "\n")
+        print(f'The sorted hashtag occcurances list is contained in the file {path}.')
+        return None
+
+
+def plot_hashtag_occurances(file_name, plots):
+    with open(file_name) as f:
+        obj = json.load(f)
+        length = len(obj)
+        hashtag_list = get_hashtag_list(obj)
+        hashtags = Counter(hashtag_list).most_common()
+        hashtags_sorted = {k:v for (k,v) in hashtags}
+        create_csv(file_name, hashtags_sorted)  
+        k = list(hashtags_sorted.keys())
+        v = list(hashtags_sorted.values()) 
+        k = k[:plots]
+        v = v[:plots]
+        plt.scatter(k, v)
+        plt.tight_layout()
+        plt.title(f'Hashtag Distribution')
+        plt.xlabel(f'Top {plots} hashtags from {length} posts.')
+        plt.ylabel(f'Number of occurances')
+        plt.show()
+    return
+
+
+
+if len(sys.argv) != 3:
+    print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
+    sys.exit()
+else:
+    try:
+        int(sys.argv[2])
+    except:
+        print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
+        raise
+    
+    try:
+        plot_hashtag_occurances(sys.argv[1], int(sys.argv[2]))
+    except:
+        print("Unexpected error:", sys.exc_info()[0])
+        raise

From db74373c3d0e5559f7afc79d2e90c7b4d683aa78 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 10 Aug 2021 19:09:00 +0200
Subject: [PATCH 04/23] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 301f450..93d90b4 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 # TikTok_plotter 
+## extract_date.py
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
 2. Use the following command: python3 extract_date.py target_file.json hashtag
 
 
 The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.
+
+## extract_hashtag.py
+1. Use the following command: python3 extract_hashtag.py target_file.json n
+2. It will plot top n hashtag frequencies. 

From 29e87b75c338b2683be53aac4bcb650836694977 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 18:34:46 +0200
Subject: [PATCH 05/23] Add files via upload

---
 extract_hashtag.py | 110 +++++++++++++++++++++++----------------------
 extract_posts.py   |  64 ++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 53 deletions(-)
 create mode 100644 extract_posts.py

diff --git a/extract_hashtag.py b/extract_hashtag.py
index 380c174..2769e3d 100644
--- a/extract_hashtag.py
+++ b/extract_hashtag.py
@@ -1,71 +1,75 @@
 import os, sys
 import csv, json
 import matplotlib.pyplot as plt
-from collections import Counter, OrderedDict
 
 
-def get_hashtag_list(obj):
+
+def get_hashtags(obj):
     if not obj:
         print(f'ERROR: Empty item, no hashtags to be extracted.')
         return
     else:
-        hashtag_list = []
-        length = len(obj)
-        for i in range(length):
+        hashtags = {}
+        l = len(obj)
+        for i in range(l):
             for hashtag in obj[i]['hashtags']:
-                hashtag_list.append(hashtag['name'])
-    return hashtag_list
+                if hashtag['name'] in hashtags:
+                    hashtags[hashtag['name']].add(i)
+                else:
+                    hashtags[hashtag['name']] = {i}
+    return hashtags
 
 
-def create_csv(file_name, d):
+def create_csv(file_name, path, d):
+    with open(path, "w") as f:
+        f.write(f"Name, Occurances, Positions" + "\n")
+        for key,value in d.items():
+            f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n")
+    print(f'The sorted hashtag occcurances list is contained in the file {path}.')
+    return None
+
+
+def plot_occurances(file_name, plots):
     base = os.path.splitext(file_name)[0]
     path = f"./{base}_sorted_hashtags.csv"
     if os.path.exists(path):
-        print(f'The file {path} containing hashtag occurances already exists.')
-        return None
+        print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.')
+        return 
     else:
-        with open(path, "w") as f:
-            f.write(f"Name, Occurances" + "\n")
-            for key,value in d.items():
-                f.write(f"{key}, {value}" + "\n")
-        print(f'The sorted hashtag occcurances list is contained in the file {path}.')
-        return None
+        with open(file_name) as f:
+            obj = json.load(f)
+            l = len(obj)
+            tags = get_hashtags(obj)
+            tags = {key: (len(value), value) for (key, value) in tags.items()}
+            sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
+            create_csv(file_name, path, sorted_tags)
+            k = list(sorted_tags.keys())
+            v = list(sorted_tags.values())
+            v = [i[0] for i in v]
+            k = k[:plots]
+            v = v[:plots]
+            plt.scatter(k, v)
+            plt.tight_layout()
+            plt.title(f'Hashtag Distribution')
+            plt.xlabel(f'Top {plots} hashtags from {l} posts.')
+            plt.ylabel(f'Number of occurances')
+            plt.show()
+        return
 
 
-def plot_hashtag_occurances(file_name, plots):
-    with open(file_name) as f:
-        obj = json.load(f)
-        length = len(obj)
-        hashtag_list = get_hashtag_list(obj)
-        hashtags = Counter(hashtag_list).most_common()
-        hashtags_sorted = {k:v for (k,v) in hashtags}
-        create_csv(file_name, hashtags_sorted)  
-        k = list(hashtags_sorted.keys())
-        v = list(hashtags_sorted.values()) 
-        k = k[:plots]
-        v = v[:plots]
-        plt.scatter(k, v)
-        plt.tight_layout()
-        plt.title(f'Hashtag Distribution')
-        plt.xlabel(f'Top {plots} hashtags from {length} posts.')
-        plt.ylabel(f'Number of occurances')
-        plt.show()
-    return
-
-
-
-if len(sys.argv) != 3:
-    print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
-    sys.exit()
-else:
-    try:
-        int(sys.argv[2])
-    except:
-        print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
-        raise
-    
-    try:
-        plot_hashtag_occurances(sys.argv[1], int(sys.argv[2]))
-    except:
-        print("Unexpected error:", sys.exc_info()[0])
-        raise
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
+        sys.exit()
+    else:
+        try:
+            int(sys.argv[2])
+        except:
+            print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
+            raise
+        
+        try:
+            plot_occurances(sys.argv[1], int(sys.argv[2]))
+        except:
+            print("Unexpected error:", sys.exc_info()[0])
+            raise
diff --git a/extract_posts.py b/extract_posts.py
new file mode 100644
index 0000000..c4bb435
--- /dev/null
+++ b/extract_posts.py
@@ -0,0 +1,64 @@
+import os, sys
+import csv, json
+import re
+from pandas import *
+
+def arg_check():
+    if len(sys.argv) != 3:
+        print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag')
+        sys.exit()
+    else:
+        return
+
+def get_hashtag_positions(file_name, hashtag):
+    base = os.path.splitext(file_name)[0]
+    path = f"./{base}_sorted_hashtags.csv"
+    if not os.path.exists(path):
+        print(f'Generating {path} ...')
+        os.system(f'python3 extract_hashtag.py {file_name} {1}')
+
+    return tag_membership(hashtag, path)
+
+
+def tag_membership(hashtag, path):
+    data = read_csv(path)
+    position_str = list(data[data["Name"] == hashtag].values[:, 2])
+    if position_str:
+        position_str = re.split('{|}', str(position_str))[1]
+        p = position_str.replace(";", ",")
+        positions = [int(s) for s in p.split(",")]
+        return positions
+    else:
+        return
+
+
+def print_posts(file_name, path, hashtag, positions):
+    with open(file_name) as f:
+        data = json.load(f)
+        posts = []
+        for p in positions:
+            posts.append(data[p])
+        keys = posts[0].keys()
+        with open(path, 'w', newline='') as csv_file:
+            writer = csv.DictWriter(csv_file, keys)
+            writer.writeheader()
+            writer.writerows(posts)
+    print(f'The posts are contained in the file {path}.')
+    return
+
+
+if __name__ == "__main__":
+    arg_check()
+    file_name = sys.argv[1]
+    hashtag = sys.argv[2]
+    path = f"./{hashtag}_posts.csv"
+    if os.path.exists(path):
+        print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.')
+        sys.exit()
+    else:
+        positions = get_hashtag_positions(file_name, hashtag)
+        if positions:
+            print_posts(file_name, path, hashtag, positions)
+        else:
+            print(f'{hashtag} not found!!!!')
+            sys.exit()

From 28b6ede51f68f2ead2aad530708cab31224ccc40 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 18:39:10 +0200
Subject: [PATCH 06/23] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 93d90b4..e6aa515 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
 # TikTok_plotter 
+The project provides tools to analyze hashtags within posts scraped from TikTok.
+
 ## extract_date.py
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
 2. Use the following command: python3 extract_date.py target_file.json hashtag
@@ -9,3 +11,6 @@ The command in point 2 uses the extract_date.py script to extract the dates and
 ## extract_hashtag.py
 1. Use the following command: python3 extract_hashtag.py target_file.json n
 2. It will plot top n hashtag frequencies. 
+
+## extract_posts.py
+1. Use the following command: python3 extract_posts.py target_file.json hashtag

From 2d6adc0028309b5687bdfc147d36be1955c920cf Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 18:59:39 +0200
Subject: [PATCH 07/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e6aa515..111e944 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# TikTok_plotter 
+# tiktok hashtag analysis toolset 
 The project provides tools to analyze hashtags within posts scraped from TikTok.
 
 ## extract_date.py

From 30f9bd9b2776d04af6118bd0f0a7b99b98519cba Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 19:00:47 +0200
Subject: [PATCH 08/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 111e944..a5febe0 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ The command in point 2 uses the extract_date.py script to extract the dates and
 
 ## extract_hashtag.py
 1. Use the following command: python3 extract_hashtag.py target_file.json n
-2. It will plot top n hashtag frequencies. 
+2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze.
 
 ## extract_posts.py
 1. Use the following command: python3 extract_posts.py target_file.json hashtag

From f3172f6d1c3002079afc452fb8443537fbf4af75 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 19:10:04 +0200
Subject: [PATCH 09/23] Update README.md

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a5febe0..3baf449 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,11 @@
 # tiktok hashtag analysis toolset 
-The project provides tools to analyze hashtags within posts scraped from TikTok.
+The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
+
+## Pre-conditions
+1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
 
 ## extract_date.py
-1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
-2. Use the following command: python3 extract_date.py target_file.json hashtag
-
+1. Use the following command: python3 extract_date.py target_file.json hashtag_name
 
 The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.
 
@@ -13,4 +14,4 @@ The command in point 2 uses the extract_date.py script to extract the dates and
 2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze.
 
 ## extract_posts.py
-1. Use the following command: python3 extract_posts.py target_file.json hashtag
+1. Use the following command: python3 extract_posts.py target_file.json hashtag_name

From 91b68cb54ec0fd1d81729cd29b4da6cc078bcff9 Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Wed, 11 Aug 2021 19:13:57 +0200
Subject: [PATCH 10/23] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3baf449..19155a0 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,12 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## extract_date.py
 1. Use the following command: python3 extract_date.py target_file.json hashtag_name
-
-The command in point 2 uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.
+2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.
 
 ## extract_hashtag.py
 1. Use the following command: python3 extract_hashtag.py target_file.json n
-2. It will plot top n hashtag frequencies. Recommendation n < = 10 for easy to read and analyze.
+2. The command above will plot top n hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze.
 
 ## extract_posts.py
 1. Use the following command: python3 extract_posts.py target_file.json hashtag_name
+2. The command above pulls out all the posts for the hashtag hashtag_name from the downloaded tiktok scraper data. 

From 1172aa1792e12aebe7703f337b87d9fcd2be3234 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:04:10 +0200
Subject: [PATCH 11/23] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 19155a0..1d9a6b6 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## Pre-conditions
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
+2. Download posts relating to a hashtag in json format. Example: tiktok-scraper tokyo2021 -t 'json'
 
 ## extract_date.py
 1. Use the following command: python3 extract_date.py target_file.json hashtag_name

From 39ae6ff2d2b670946b703c9e4ba8e548425e1a64 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:05:35 +0200
Subject: [PATCH 12/23] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1d9a6b6..7686432 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,11 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## Pre-conditions
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
-2. Download posts relating to a hashtag in json format. Example: tiktok-scraper tokyo2021 -t 'json'
+2. Download posts relating to a hashtag in **json** format. Example: tiktok-scraper tokyo2021 -t 'json'
 
 ## extract_date.py
 1. Use the following command: python3 extract_date.py target_file.json hashtag_name
-2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the .json file.
+2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the '.json' file.
 
 ## extract_hashtag.py
 1. Use the following command: python3 extract_hashtag.py target_file.json n

From 2d32677dc9a17afe416ee1219efb46dd60e24520 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:05:54 +0200
Subject: [PATCH 13/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7686432..401207e 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## Pre-conditions
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
-2. Download posts relating to a hashtag in **json** format. Example: tiktok-scraper tokyo2021 -t 'json'
+2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper tokyo2021 -t 'json'
 
 ## extract_date.py
 1. Use the following command: python3 extract_date.py target_file.json hashtag_name

From 64568058f36392f04de2360aca578bb08e97abf2 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:06:27 +0200
Subject: [PATCH 14/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 401207e..464e3ef 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## Pre-conditions
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
-2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper tokyo2021 -t 'json'
+2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper hashtag tokyo2021 -t 'json'
 
 ## extract_date.py
 1. Use the following command: python3 extract_date.py target_file.json hashtag_name

From f4f66bfd21f6ca3afc0c1fc4a8f76d9dde1833e9 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:06:56 +0200
Subject: [PATCH 15/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 464e3ef..4727506 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 
 ## extract_hashtag.py
 1. Use the following command: python3 extract_hashtag.py target_file.json n
-2. The command above will plot top n hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze.
+2. The command above will plot top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze.
 
 ## extract_posts.py
 1. Use the following command: python3 extract_posts.py target_file.json hashtag_name

From b429c4a88abcc8a712a00d34fbd5ceb4070827d5 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:07:29 +0200
Subject: [PATCH 16/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4727506..2461537 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # tiktok hashtag analysis toolset 
 The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
 
-## Pre-conditions
+## Pre-requisites
 1. Download and install TikTok scraper: https://github.com/drawrowfly/tiktok-scraper
 2. Download posts relating to a hashtag in the **json** format. Example: tiktok-scraper hashtag tokyo2021 -t 'json'
 

From d6d325b31a7470d1fc83f20068b81756b3e818ef Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Wed, 11 Aug 2021 20:08:47 +0200
Subject: [PATCH 17/23] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2461537..213f933 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# tiktok hashtag analysis toolset 
+# TikTok hashtag analysis toolset 
 The project provides tools to analyze hashtags based on data downloaded using tiktok-scraper (https://github.com/drawrowfly/tiktok-scraper).
 
 ## Pre-requisites

From 37998dd7769fa8469a66d1bbaac764e38e793158 Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Mon, 25 Oct 2021 13:52:46 +0200
Subject: [PATCH 18/23] Add files via upload

---
 extract_hashtag.py        | 104 +++++++++++++++++++++-----------------
 extract_posts.py          |  85 +++++++++++++------------------
 top_hashtag_occurances.py |  83 ++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+), 96 deletions(-)
 create mode 100644 top_hashtag_occurances.py

diff --git a/extract_hashtag.py b/extract_hashtag.py
index 2769e3d..0b9e1fc 100644
--- a/extract_hashtag.py
+++ b/extract_hashtag.py
@@ -1,5 +1,6 @@
 import os, sys
 import csv, json
+import argparse
 import matplotlib.pyplot as plt
 
 
@@ -20,56 +21,67 @@ def get_hashtags(obj):
     return hashtags
 
 
-def create_csv(file_name, path, d):
-    with open(path, "w") as f:
-        f.write(f"Name, Occurances, Positions" + "\n")
-        for key,value in d.items():
-            f.write(f"{key}, {value[0]}, " + f"{value[1]}".replace(",", ";") + "\n")
-    print(f'The sorted hashtag occcurances list is contained in the file {path}.')
-    return None
-
-
-def plot_occurances(file_name, plots):
-    base = os.path.splitext(file_name)[0]
-    path = f"./{base}_sorted_hashtags.csv"
-    if os.path.exists(path):
-        print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.')
-        return 
-    else:
-        with open(file_name) as f:
-            obj = json.load(f)
-            l = len(obj)
-            tags = get_hashtags(obj)
-            tags = {key: (len(value), value) for (key, value) in tags.items()}
+def get_occurances(filename, n=1 , sort=True):
+    with open(filename) as f:
+        obj = json.load(f)
+        l = len(obj)
+        tags = get_hashtags(obj)
+        tags = {key: (len(value), value) for (key, value) in tags.items()}
+        if not sort:
+            k = list(tags.keys())
+            v = list(tags.values())
+            return obj, k, v 
+        else:
             sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
-            create_csv(file_name, path, sorted_tags)
             k = list(sorted_tags.keys())
             v = list(sorted_tags.values())
-            v = [i[0] for i in v]
-            k = k[:plots]
-            v = v[:plots]
-            plt.scatter(k, v)
-            plt.tight_layout()
-            plt.title(f'Hashtag Distribution')
-            plt.xlabel(f'Top {plots} hashtags from {l} posts.')
-            plt.ylabel(f'Number of occurances')
-            plt.show()
-        return
+            k = k[:n]
+            v_total = [i[0] for i in v]
+            v_total = v_total[:n]
+            return l, k, v_total
+
+
+
+def plot(n, length, k, v):
+    plt.scatter(k, v)
+    plt.tight_layout()
+    plt.title(f'Hashtag Distribution')
+    plt.xlabel(f'Top {n} hashtags from {length} posts.')
+    plt.ylabel(f'Number of occurances')
+    plt.show()
+    return
+
+
+def print_occurances(k, v):
+    row_number = 0
+    print(f'Hashtag  Occurances')
+    for key,value in zip(k, v):
+        print(f'{row_number}\t{key}\t\t{value}')
+        row_number += 1
+    return
+
 
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
-        sys.exit()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The json hashtag file name")
+    parser.add_argument("n", help="The number of top n occurances", type=int)
+    parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true")
+    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
+    args = parser.parse_args()
+    if args.input_file and args.n:
+        if args.n < 1:
+            print(f"Please make sure the number of top occurances is a positive integer.")
+            sys.exit()
+
+        base = os.path.splitext(args.input_file)[0]
+        path = f"./{base}_sorted_hashtags.csv"
+        if args.plot:
+            length, keys, values = get_occurances(args.input_file, args.n)
+            plot(args.n, length, keys, values)
+        else:
+            length, keys, values = get_occurances(args.input_file, args.n)
+            print_occurances(keys, values)
     else:
-        try:
-            int(sys.argv[2])
-        except:
-            print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
-            raise
-        
-        try:
-            plot_occurances(sys.argv[1], int(sys.argv[2]))
-        except:
-            print("Unexpected error:", sys.exc_info()[0])
-            raise
+        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
+            
diff --git a/extract_posts.py b/extract_posts.py
index c4bb435..721393b 100644
--- a/extract_posts.py
+++ b/extract_posts.py
@@ -1,64 +1,49 @@
 import os, sys
-import csv, json
-import re
-from pandas import *
-
-def arg_check():
-    if len(sys.argv) != 3:
-        print(f'ERROR: Please make sure you enter the following in the command line: python3 extract_posts.py file.json hashtag')
-        sys.exit()
-    else:
-        return
-
-def get_hashtag_positions(file_name, hashtag):
-    base = os.path.splitext(file_name)[0]
-    path = f"./{base}_sorted_hashtags.csv"
-    if not os.path.exists(path):
-        print(f'Generating {path} ...')
-        os.system(f'python3 extract_hashtag.py {file_name} {1}')
-
-    return tag_membership(hashtag, path)
+from extract_hashtag import get_occurances
 
 
-def tag_membership(hashtag, path):
-    data = read_csv(path)
-    position_str = list(data[data["Name"] == hashtag].values[:, 2])
-    if position_str:
-        position_str = re.split('{|}', str(position_str))[1]
-        p = position_str.replace(";", ",")
-        positions = [int(s) for s in p.split(",")]
-        return positions
-    else:
-        return
+def filter_positions(hashtags, keys, positions):
+    filtered = []
+    for hashtag in hashtags:
+        try: 
+            i = keys.index(hashtag)
+            key = keys[i]
+            post_indices = positions[i][1]
+            filtered.append((key, post_indices))
+        except Exception as error:
+            print(error)
+            continue
+    return filtered
 
 
-def print_posts(file_name, path, hashtag, positions):
-    with open(file_name) as f:
-        data = json.load(f)
-        posts = []
-        for p in positions:
-            posts.append(data[p])
-        keys = posts[0].keys()
-        with open(path, 'w', newline='') as csv_file:
-            writer = csv.DictWriter(csv_file, keys)
-            writer.writeheader()
-            writer.writerows(posts)
-    print(f'The posts are contained in the file {path}.')
-    return
+def write_posts(path, obj, filtered):
+    length = len(filtered)
+    with open(path, "w") as output_file:
+        for i in range(length):
+            hashtag = filtered[i][0]
+            total_positions = len(filtered[i][1])
+            positions = list(filtered[i][1])
+            first_position = positions[0]
+            output_file.write(f"{hashtag}, {obj[first_position]}" + "\n")
+            for p in range(1, total_positions):
+                output_file.write(f" , {obj[positions[p]]}" + "\n")
+            print(f"{total_positions} posts written for the hashtag - {hashtag}")
 
 
 if __name__ == "__main__":
-    arg_check()
     file_name = sys.argv[1]
-    hashtag = sys.argv[2]
-    path = f"./{hashtag}_posts.csv"
+    hashtags = list(sys.argv[2:])
+    name = f"{hashtags[0]}_{len(hashtags)}"
+    path = f"../{name}_posts.csv"
     if os.path.exists(path):
         print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.')
         sys.exit()
     else:
-        positions = get_hashtag_positions(file_name, hashtag)
-        if positions:
-            print_posts(file_name, path, hashtag, positions)
+        obj, keys, positions = get_occurances(file_name, sort=False)
+        filtered = filter_positions(hashtags, keys, positions)
+        if filtered:
+            write_posts(path, obj, filtered)
         else:
-            print(f'{hashtag} not found!!!!')
-            sys.exit()
+            print(f"No posts found for the hashtags you entered.")
+        
+
diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py
new file mode 100644
index 0000000..dcee5aa
--- /dev/null
+++ b/top_hashtag_occurances.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python3
+
+import os, time
+import json
+import argparse
+from datetime import datetime
+
+
+def parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+")
+    parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int)
+    args = parser.parse_args()
+    return args
+
+
+def check_file_existence(hashtag, contains=None):
+    pwd = "./"
+    for i in os.listdir(pwd):
+        #if os.path.isfile(os.path.join(pwd, i)) and hashtag in i:
+        if hashtag in i and contains in i:
+            return i
+        elif hashtag in i:
+            return i
+        else:
+            continue
+    return
+
+
+def get_input_file(hashtag):
+    check_file = check_file_existence(hashtag, "json")
+    if check_file:
+        return check_file
+    else:
+        try: 
+            os.system(f"tiktok-scraper hashtag {hashtag} -t json")
+            c = check_file_existence(hashtag, "json")
+            if c:
+                return c
+            else:
+                print(f"ERROR: No json file relating to {hashtag} found.")
+        except:
+            raise
+
+
+def copy_data(input_file, output_file):
+    os.system(f"cat {input_file} >> {output_file} && echo >> {output_file}")
+    return
+
+
+def get_data(hashtag, n):
+    input_file = get_input_file(hashtag)
+    if input_file:
+        os.system(f"python3 extract_hashtag.py {input_file} {str(n)} -o")
+        base = os.path.splitext(input_file)[0]
+        data_file = f"{base}_sorted_hashtags.csv"
+        if os.path.exists(data_file):
+            return data_file
+    return 
+
+
+def get_occurances(hashtag, n, output):
+    data_file = get_data(hashtag, n)
+    copy_data(data_file, output)
+    os.system(f"rm {data_file}")
+    print(f"{data_file} removed ....")
+
+
+if __name__ == "__main__":
+    args = parser()
+    hashtags = args.hashtags
+    now = datetime.now().strftime("%d%m%Y-%H%M%S")
+    output = f"./{now}.csv"
+    l = len(hashtags)
+    if l > 1:
+        sleep = 30 # Sleep time (in secs) between two tiktok scraping requests.
+        get_occurances(hashtags[0], args.top_n, output)
+        for i in range(1, l):
+            time.sleep(30)
+            get_occurances(hashtags[i], args.top_n, output)
+    else:
+        get_occurances(hashtags[0], args.top_n, output)
+    print(f"The output data is stored in the file {output}")

From 8f9427a5f81f24725653b457e8e16d23e64ae3bd Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Mon, 25 Oct 2021 14:16:18 +0200
Subject: [PATCH 19/23] Update README.md

---
 README.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 213f933..fce7f73 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,10 @@ The project provides tools to analyze hashtags based on data downloaded using ti
 2. The command above uses the extract_date.py script to extract the dates and the corresponding number of hashtag posts for each date that the TikTok scraper retrieves in the '.json' file.
 
 ## extract_hashtag.py
-1. Use the following command: python3 extract_hashtag.py target_file.json n
-2. The command above will plot top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation n < = 10 for easy to read and analyze.
+1. Use the following command to print the result on the screen: python3 extract_hashtag.py target_file.json n -d
+2. Use the following command to plot: python3 extract_hashtag.py target_file.json n -p
+3. The commands above print or plot the top **n** hashtag frequencies based on the json file downloaded using tiktok scraper for a given hashtag. Recommendation: n <= 10, so that the output stays easy to read and analyze.
 
 ## extract_posts.py
-1. Use the following command: python3 extract_posts.py target_file.json hashtag_name
-2. The command above pulls out all the posts for the hashtag hashtag_name from the downloaded tiktok scraper data. 
+1. Use the following command: python3 extract_posts.py target_file.json hashtag_names
+2. The command above pulls out all the posts for each of the hashtags in hashtag_names (separate multiple names with spaces) from the downloaded tiktok scraper data. 

From 2d3f4a9aab2a1839a55e9a7c89166e6fad8a107a Mon Sep 17 00:00:00 2001
From: johannawild <72805812+johannawild@users.noreply.github.com>
Date: Mon, 25 Oct 2021 14:16:33 +0200
Subject: [PATCH 20/23] Create README.md

corrections

From 2a34e03dc83249c46e8908fc02f785cc13ea2de2 Mon Sep 17 00:00:00 2001
From: X <work@Xs-MacBook-Pro.local>
Date: Sun, 30 Jan 2022 13:51:08 +0100
Subject: [PATCH 21/23] rebase

---
 analytics/hashtag_frequencies.py         |  90 ++++++++++
 analytics/logging_analytics.py           |   4 +
 tiktok_downloader/data_methods.py        | 123 +++++++++++++
 tiktok_downloader/file_methods.py        | 201 +++++++++++++++++++++
 tiktok_downloader/global_data.py         |  38 ++++
 tiktok_downloader/hashtag_list.py        |  37 ++++
 tiktok_downloader/hashtag_list_sample.py |   8 +
 tiktok_downloader/run_downloader.py      | 212 +++++++++++++++++++++++
 8 files changed, 713 insertions(+)
 create mode 100644 analytics/hashtag_frequencies.py
 create mode 100644 analytics/logging_analytics.py
 create mode 100644 tiktok_downloader/data_methods.py
 create mode 100644 tiktok_downloader/file_methods.py
 create mode 100644 tiktok_downloader/global_data.py
 create mode 100644 tiktok_downloader/hashtag_list.py
 create mode 100644 tiktok_downloader/hashtag_list_sample.py
 create mode 100644 tiktok_downloader/run_downloader.py

diff --git a/analytics/hashtag_frequencies.py b/analytics/hashtag_frequencies.py
new file mode 100644
index 0000000..3afd405
--- /dev/null
+++ b/analytics/hashtag_frequencies.py
@@ -0,0 +1,90 @@
+import os, sys
+import csv, json
+import argparse
+import matplotlib.pyplot as plt
+
+
+
+def get_hashtags(obj):
+    """Map every hashtag name in *obj* (a list of posts) to the set of post indices using it.
+
+    Prints an error and returns an empty dict when *obj* is empty, so callers can always
+    iterate over the result (returning None made get_occurrences fail on tags.items()).
+    """
+    hashtags = {}
+    if not obj:
+        print(f'ERROR: Empty item, no hashtags to be extracted.')
+        return hashtags
+    for i, post in enumerate(obj):
+        for hashtag in post['hashtags']:
+            hashtags.setdefault(hashtag['name'], set()).add(i)
+    return hashtags
+
+
+def get_occurrences(filename, n=1 , sort=True):
+    """Load the posts in *filename* (JSON) and count how many posts use each hashtag.
+
+    sort=False returns (posts, names, [(count, post_indices), ...]) in first-seen order;
+    sort=True returns (number_of_posts, top_n_names, top_n_counts), most frequent first.
+    """
+    with open(filename) as f:
+        obj = json.load(f)
+    tags = get_hashtags(obj) or {}
+    tags = {key: (len(value), value) for (key, value) in tags.items()}
+    if not sort:
+        return obj, list(tags.keys()), list(tags.values())
+    # Rank on the count alone: comparing the whole (count, set) tuple falls back to
+    # subset comparison of the index sets on ties, which is not a total order.
+    ranked = sorted(tags.items(), key=lambda item: item[1][0], reverse=True)[:n]
+    k = [name for name, _ in ranked]
+    v_total = [count for _, (count, _) in ranked]
+    return len(obj), k, v_total
+
+
+
+def plot(n, length, k, v):
+    plt.scatter(k, v)
+    plt.tight_layout()
+    plt.title(f'Hashtag Distribution')
+    plt.xlabel(f'Top {n} hashtags from {length} posts.')
+    plt.ylabel(f'Number of occurrences')
+    plt.show()
+    return
+
+
+def print_occurrences(l, k, v):
+    row_number = 0
+    total_posts = l
+    print ("{:<8} {:<15} {:<15} {:<15}".format("Rank", 'Hashtag','Occurrences',f'Frequency (Occurrences/Total-Posts({l}))'))
+    #print(f'Hashtag                  Occurrences                 Frequency(Occurances/Total-Posts)')
+    for key,value in zip(k, v):
+        ratio = value/total_posts 
+        print ("{:<8} {:<15} {:<15} {:<15}".format(row_number, key, value, ratio))
+        #print(f'{row_number}\t{key}\t\t{value}\t\t{ratio:.3f}')
+        row_number += 1
+    return
+
+
+
+if __name__ == "__main__":
+    # Command-line entry point: print (default, or -d) or plot (-p) the top n hashtags.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The json hashtag file name")
+    parser.add_argument("n", help="The number of top n occurrences", type=int)
+    parser.add_argument("-p", "--plot", help="Plot the occurrences", action="store_true")
+    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
+    args = parser.parse_args()
+
+    # Validate n first: n == 0 is falsy, so the old combined truthiness test hid this message.
+    if args.n < 1:
+        print(f"Please make sure the number of top occurrences is a positive integer.")
+        sys.exit(1)
+    if not args.input_file:
+        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
+        sys.exit(1)
+
+    length, keys, values = get_occurrences(args.input_file, args.n)
+    if args.plot:
+        plot(args.n, length, keys, values)
+    else:
+        print_occurrences(length, keys, values)
diff --git a/analytics/logging_analytics.py b/analytics/logging_analytics.py
new file mode 100644
index 0000000..cba8ca5
--- /dev/null
+++ b/analytics/logging_analytics.py
@@ -0,0 +1,4 @@
+"""
+Yet to be written ...
+"""
+
diff --git a/tiktok_downloader/data_methods.py b/tiktok_downloader/data_methods.py
new file mode 100644
index 0000000..c35e2a4
--- /dev/null
+++ b/tiktok_downloader/data_methods.py
@@ -0,0 +1,123 @@
+import os
+from collections import namedtuple
+from datetime import datetime
+import global_data
+import file_methods
+
+
+Difference = namedtuple("Difference", "new_ids size")
+Total = namedtuple("Total", "total unique")
+
+
+def get_difference(tag, file, ids):
+    maiden_entry = False
+    current_id_data = file_methods.get_data(file)
+    if tag in current_id_data:
+        current_ids = current_id_data[tag]
+        set1 = set(current_ids)
+        set2 = set(ids)
+        new_ids = set2.difference(set1)
+        if new_ids:
+            new_ids = list(new_ids)
+            size = len(new_ids)
+            diff = Difference(new_ids, size)
+            return (diff, maiden_entry)
+        else:
+            return ([], maiden_entry)
+    else:
+        maiden_entry = True
+        return (ids, maiden_entry)
+
+
+def extract_posts(settings, file_name, tag):
+    ids = []
+    posts = []
+    new_posts = []
+
+    posts = file_methods.get_data(file_name)
+    for post in posts:
+        ids.append(post["id"])
+    if not ids:
+        print(f"WARNING: no posts were found for {tag} in the file - {file_name}")
+        return
+   
+    status = file_methods.check_existence(settings["post_ids"], "file")
+    if not status:
+        new_data = (ids, posts)
+        return new_data
+    else:
+        res = get_difference(tag, settings["post_ids"], ids)
+        if res[1]:
+            new_data = (ids, posts)
+            return new_data
+        else:
+            if res[0]:
+                for i in res[0].new_ids:
+                    for post in posts:
+                        if (i == post["id"]):
+                            new_posts.append(post)
+                new_data = (res[0].new_ids, new_posts)
+                return new_data
+            else:
+                print(f"WARNING: No new posts were found in the downloaded file - {file_name}")
+                return
+
+
+def extract_videos(settings, tag, download_list):
+    status = file_methods.check_existence(settings["video_ids"], "file")
+    if not status:
+        new_data = download_list
+        return new_data
+    else:
+        res = get_difference(tag, settings["video_ids"], download_list)
+        if res[1]:
+            return download_list
+        else:
+            if res[0]:
+                new_data = res[0].new_ids
+                return new_data
+            else:
+                print(f"WARNING: No new videos were found for the {tag} in the downloaded folder.")
+                return
+
+
+def update_posts(file_path, file_type, new_data, tag=None):
+    try:
+        status = file_methods.check_existence(file_path, file_type)
+        if not tag:
+            file_methods.post_writer(file_path, new_data, status)
+        else:
+            log = file_methods.id_writer(file_path, new_data, tag, status)
+            return log
+    except:
+        raise
+
+
+def update_videos(settings, new_data, tag):
+    file_path = settings["video_ids"]
+    file_methods.check_file(file_path, "file")
+    log = file_methods.id_writer(file_path, new_data, tag, True)
+    file_methods.clean_video_files(settings, tag, new_data)
+    return log
+
+
+def get_total_posts(file_path, tag):
+    """Return Total(total, unique) for the ids stored under *tag* in the JSON file file_path.
+
+    Raises OSError when file_path does not exist.
+    """
+    if not file_methods.check_existence(file_path, "file"):
+        # f-prefix added: the message previously contained the literal text "{file_path}".
+        raise OSError(f"{file_path} not found!")
+    ids = file_methods.get_data(file_path)[tag]
+    return Total(len(ids), len(set(ids)))
+
+
+def print_total(file_path, tag, data_type):
+    total = get_total_posts(file_path, tag)
+    if (total.total == total.unique):
+        print(f"Total {data_type} for the hashtag {tag} are: {total.total}")
+        return
+    else:
+        print(f"WARNING: out of total {data_type} for the hashtag {tag} {total.total}, only {total.unique} are unique. Something is going wrong...")
+        return
diff --git a/tiktok_downloader/file_methods.py b/tiktok_downloader/file_methods.py
new file mode 100644
index 0000000..8842f07
--- /dev/null
+++ b/tiktok_downloader/file_methods.py
@@ -0,0 +1,201 @@
+import os, json, subprocess
+from datetime import datetime
+import global_data
+import data_methods
+
+
+def create_file(name, file_type):
+    if (file_type == "dir"):
+        os.makedirs(name, mode=0o777)
+    elif (file_type == "file"):
+        with open(name, "w"): pass
+    else:
+        print(f"ERROR: either {file_type} or is not well defined.")
+    return
+
+
+def check_existence(file_path, file_type):
+    if (file_type == "file"):
+        if os.path.isfile(file_path):
+            return True
+        else:
+            return False
+    elif (file_type == "dir"):
+        if os.path.isdir(file_path):
+            return True
+        else:
+            return False
+    else:
+        raise OSError(f"{file_type} has to be a 'dir' or a 'file'!!!")
+
+
+def check_file(file_path, file_type):
+    status = check_existence(file_path, file_type)
+    if not status:
+        create_file(file_path, file_type)    
+
+    return
+
+
+def download_posts(settings, tag):
+    """Run tiktok-scraper for *tag* inside the tag's posts directory.
+
+    Returns the last whitespace-separated token of the scraper's stdout when it contains
+    "json" (taken to be the path of the downloaded file), otherwise prints an error and
+    returns None.
+    """
+    path = os.path.join(settings["data"], tag, settings["posts"])
+    # cwd= runs only the scraper inside `path`. The previous os.chdir() round trip left this
+    # process in the wrong directory if anything raised, and its hard-coded way back
+    # ("../../../tiktok_downloader") only worked for one particular directory layout.
+    tiktok_command = f"tiktok-scraper hashtag {tag} -t 'json'" 
+    result = subprocess.run(tiktok_command, capture_output=True, shell=True, cwd=path)
+    if not result.stdout:
+        print(f"ERROR: No file was downloaded by the tiktok-scraper for the {tag} !!!!")
+        return
+    new_file = result.stdout.decode('utf-8').split()[-1]
+    if "json" in new_file:
+        return new_file
+    print(f"ERROR: Something's wrong with what is returned by tiktok-scraper for the hashtag {tag} - *{new_file}* is not a json file!!!!")
+
+
+
+def download_videos(settings, tag):
+    """Download videos for *tag* with tiktok-scraper and list what was fetched.
+
+    Returns the downloaded file names with their last four characters (the extension)
+    stripped, or None when the scraper printed nothing or the "#<tag>" folder is empty.
+    """
+    path = os.path.join(settings["data"], tag, settings["videos"])
+
+    # cwd= runs only the scraper inside `path`; the previous os.chdir() round trip left
+    # this process in the wrong directory if anything raised (e.g. a missing "#<tag>"
+    # folder) and hard-coded the way back to "../../../tiktok_downloader".
+    tiktok_command = f"tiktok-scraper hashtag {tag} -n {settings['number_of_videos']} -d" 
+    result = subprocess.run(tiktok_command, capture_output=True, shell=True, cwd=path)
+    if not result.stdout:
+        print(f"WARNING: Something went wrong with the tiktok-scraper video download for the {tag} !!!!")
+        return
+
+    # The scraper stores the videos in a "#<tag>" sub-folder of its working directory.
+    downloaded_list_tmp = os.listdir(os.path.join(path, f"#{tag}"))
+    if downloaded_list_tmp:
+        # Drop the 4-character extension (".mp4") to recover the bare file names.
+        return [file[0:-4] for file in downloaded_list_tmp]
+
+    print(f"WARNING: No video files were downloaded for the hashtag {tag}.")
+    # Remove the empty download folder; this path is relative to the unchanged cwd.
+    subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True)
+
+
+def get_data(file_path):
+    with open(file_path, "r") as f:
+        data = json.load(f)
+        return data
+
+
+def dump_data(file_path, data):
+    with open(file_path, "w") as f:
+        json.dump(data, f)
+        return            
+
+def log_writer(log_data):
+    total = 0
+    try:
+        log_dict = {}
+        for ele in log_data:
+            if ele[0] in log_dict:
+                if ele[1][0] in log_dict[ele[0]]:
+                    log_dict[ele[0]][ele[1][0]] += ele[1][1]
+                else:
+                    log_dict[ele[0]][ele[1][0]] = ele[1][1]
+                total += ele[1][1]
+            else:
+                log_dict[ele[0]] = { ele[1][0] : ele[1][1] }
+                total += ele[1][1]
+
+        logger = global_data.FILES["logger"]
+        now = datetime.now()
+        now_str = now.strftime("%d-%m-%Y %H:%M:%S")
+        status = check_existence(logger, "file")
+        if status:
+            data = get_data(logger)
+            data[now_str] = log_dict
+            dump_data(logger, data)
+        else:
+            data = { now_str : log_dict }
+            dump_data(logger, data)
+        print(f"Successfully logged {total} entries!!!!")
+        return
+    except: raise
+
+
+def id_writer(file_path, new_data, tag, status):
+    try:
+        total = len(new_data)
+        if status:
+            try:
+                data = get_data(file_path)
+                if tag in data:
+                    data[tag] += new_data
+                else:
+                    data[tag]= new_data 
+                dump_data(file_path, data)
+            except json.decoder.JSONDecodeError:
+                data = { tag : new_data }
+                dump_data(file_path, data)
+        else:
+            data = { tag : new_data }
+            dump_data(file_path, data)
+        print(f"SUCCESS - {total} entries added to {file_path}!!!")
+        log_data = (tag, total)
+        return log_data
+    except: raise
+
+
+def post_writer(file_path, new_data, status):
+    try:
+        total = len(new_data)
+        if status:
+            try:
+                data = get_data(file_path)
+                data += new_data
+                dump_data(file_path, data)
+            except json.decoder.JSONDecodeError:
+                data = new_data
+                dump_data(file_path, data)
+        else:
+            data = new_data
+            dump_data(file_path, data)
+        print(f"SUCCESS - {total} entries added to {file_path}!!!")
+        return
+    except: raise
+
+
+def delete_file(file_path, file_type):
+    if not check_existence(file_path, file_type):
+        print(f"ERROR: Attempt to delete failed. {file_path} does not exist!!!")
+    elif (file_type == "file"):
+        os.remove(file_path)
+        print(f"Successfully deleted {file_path}!!!")
+        return
+    elif (file_type == "dir"):
+        os.rmdir(file_path)
+        print(f"Successfully deleted {file_path}!!!")
+        return
+    else:
+        print(f"ERROR: {file_type} needs to be either 'file' or 'dir' !!!")
+        return
+
+
+def clean_video_files(settings, tag, new_data=None):
+    try:
+        if new_data:
+            for file in new_data:
+                settings["videos_from"] = settings['data'] + f"/{tag}/videos/#{tag}/{file}.mp4"
+                subprocess.call(f"mv {settings['videos_from']} {settings['videos_to']}", shell=True)
+             
+        subprocess.call(f"rm -rf {settings['videos_delete']}", shell=True)
+        print(f"Successfully deleted the folder {settings['videos_delete']} folder of videos.")
+    except:
+        raise
diff --git a/tiktok_downloader/global_data.py b/tiktok_downloader/global_data.py
new file mode 100644
index 0000000..b83df45
--- /dev/null
+++ b/tiktok_downloader/global_data.py
@@ -0,0 +1,38 @@
+# Directories
+DATA = "../data"
+IDS = "ids"
+LOG = "log"
+POSTS = "posts"
+VIDEOS = "videos"
+
+# Files
+POST_IDS = "post_ids.json"
+VIDEO_IDS = "video_ids.json"
+DATA_FILE = "data.json"
+LOG_FILE = "log.json"
+
+
+FILES = {
+            "data" : DATA,
+            "ids" : IDS,
+            "log" : LOG,
+            "posts" : POSTS,
+            "videos" : VIDEOS,
+            "post_ids" : f"{DATA}/{IDS}/{POST_IDS}",
+            "video_ids" : f"{DATA}/{IDS}/{VIDEO_IDS}",
+            "data_file" : f"{DATA_FILE}",
+            "downloads" : [],
+            "logger" : f"{DATA}/{LOG}/{LOG_FILE}",
+        }
+
+
+
+# Commands
+tag = ""
+
+COMMANDS = {
+            "number_of_videos" : 3, # Number of videos to be downloaded by tiktok-scraper.
+            "post_download" : f"tiktok-scraper hashtag {tag} -t 'json'",
+            "video_download" : f"tiktok-scraper hashtag {tag} -d",
+            "sleep" : 8
+        }
diff --git a/tiktok_downloader/hashtag_list.py b/tiktok_downloader/hashtag_list.py
new file mode 100644
index 0000000..e595523
--- /dev/null
+++ b/tiktok_downloader/hashtag_list.py
@@ -0,0 +1,37 @@
+hashtag_list = [
+# This is a sample hashtag list. Please enter your hashtag list (without the comment).
+#            "london",
+#            "paris",
+#            "newyork",
+#            "tokyo"
+            "uyghur",
+            "uyghur2021",
+            "uyghur2022",
+            "uyghurmuslims",
+            "xinjiang",
+            "xinjiangchina",
+            "xinjiangcotton",
+            "xinjiangtravel",
+            "uyghurlivesmatter",
+            "uighur",
+            "Uighurs",
+            "Uyghurs",
+            "uighuren",
+            "saveuyghur",
+            "uighurmuslims",
+            "chinesemuslim",
+            "uyghurpeople",
+            "urumqi",
+            "chinaxinjiang",
+            "xinjianguyghurs",
+            "eastturkestan",
+            "chinaconcentrationcamp",
+            "xinjianguyghur🇨🇳",
+            "kashgar",
+            "xinjiangreeducationcamps",
+            "uyghur_tiktok",
+            "uyghurreality",
+            "xinjiangdance",
+            "westernmedia",
+            "uyghurgenocide"
+        ]
diff --git a/tiktok_downloader/hashtag_list_sample.py b/tiktok_downloader/hashtag_list_sample.py
new file mode 100644
index 0000000..4ddff1a
--- /dev/null
+++ b/tiktok_downloader/hashtag_list_sample.py
@@ -0,0 +1,8 @@
+hashtag_list = [
+# This is a sample hashtag list. Please enter your hashtag list (without the comment).
+            "london",
+            "paris",
+            "newyork",
+            "tokyo"
+
+        ]
diff --git a/tiktok_downloader/run_downloader.py b/tiktok_downloader/run_downloader.py
new file mode 100644
index 0000000..d4ccffe
--- /dev/null
+++ b/tiktok_downloader/run_downloader.py
@@ -0,0 +1,212 @@
+import os, sys
+import time
+import json
+import argparse
+
+import global_data
+import file_methods
+import data_methods
+
+
+
+command = "python3 post_downloader.py "
+
+def get_hashtag_list():
+    try:
+        from hashtag_list import hashtag_list
+        return hashtag_list
+    except ImportError as error:
+        print("ImportError: " + str(error))
+        print(f"Please provide at least one hashtag either by entering as an argument or by adding hashtags to the list hashtag_list in the file hashtag_list.py")
+        sys.exit()
+
+
+def create_parser():
+    # Creating the parser
+    parser = argparse.ArgumentParser(description="Download the tiktoks for the requested hashtags")
+
+    # Adding the arguments
+    #parser.add_argument("--h", type=str, nargs="*", required=True, help="List of hashtags")
+    parser.add_argument("--h", type=str, nargs="*", help="List of hashtags")
+    parser.add_argument("-p", action="store_true", help="Download posts")
+    parser.add_argument("-v", action="store_true", help="Download videos")
+
+    return parser
+
+
+def set_download_settings(download_data_type):
+    settings = {}
+    settings["data"] = global_data.FILES["data"]
+    settings["ids"] = global_data.FILES["ids"]
+    settings["log"] = global_data.FILES["log"]
+    settings["logger"] = global_data.FILES["logger"]
+    settings["sleep"] = global_data.COMMANDS["sleep"]
+    file_methods.check_file(f"{settings['data']}/{settings['ids']}", "dir")
+    file_methods.check_file(f"{settings['data']}/{settings['log']}", "dir")
+    if download_data_type == "posts":
+        settings["posts"] = global_data.FILES["posts"]
+        settings["post_ids"] = global_data.FILES["post_ids"]
+        settings["post_download"] = global_data.COMMANDS["post_download"]
+        settings["data_file"] = global_data.FILES["data_file"]
+        return settings
+    elif download_data_type == "videos":
+        settings["videos"] = global_data.FILES["videos"]
+        settings["video_ids"] = global_data.FILES["video_ids"]
+        settings["video_download"] = global_data.COMMANDS["video_download"]
+        settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"]
+        return settings
+    elif download_data_type == "posts-videos":
+        settings["posts"] = global_data.FILES["posts"]
+        settings["post_ids"] = global_data.FILES["post_ids"]
+        settings["data_file"] = global_data.FILES["data_file"]
+        settings["post_download"] = global_data.COMMANDS["post_download"]
+        settings["videos"] = global_data.FILES["videos"]
+        settings["video_ids"] = global_data.FILES["video_ids"]
+        settings["video_download"] = global_data.COMMANDS["video_download"]
+        settings["number_of_videos"] = global_data.COMMANDS["number_of_videos"]
+        return settings
+    else:
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit()
+
+
+
+def get_posts(settings, tag):
+    file_path = file_methods.download_posts(settings, tag)
+    log = ()
+    if file_path:
+        new_data = data_methods.extract_posts(settings, file_path, tag)
+        if new_data:
+            data_file = os.path.join(settings["data"], tag, settings["posts"], settings["data_file"])
+            data_methods.update_posts(data_file, "file", new_data[1])
+            log = data_methods.update_posts(settings["post_ids"], "file", new_data[0], tag)
+        file_methods.delete_file(file_path, "file")
+    
+    return log
+
+
+
+def get_videos(settings, tag):    
+    log = ()
+    download_list = file_methods.download_videos(settings, tag)
+    if download_list:
+        new_data = data_methods.extract_videos(settings, tag, download_list)
+        if new_data:
+            log = data_methods.update_videos(settings, new_data, tag)
+        else:
+            file_methods.clean_video_files(settings, tag)
+    return log
+
+
+
+def get_data(hashtags, download_data_type):
+    """Download posts and/or videos for each hashtag and return the collected (tag, (kind, count)) log tuples."""
+    counter = 0
+    total_hashtags = len(hashtags)
+    log_data = []
+    # Pause between hashtags but not after the last one. counter is compared after its increment, so the bound is total_hashtags; the former "total_hashtags - 1" bound skipped the pause before the final hashtag.
+    if download_data_type == "posts":
+        settings = set_download_settings(download_data_type)
+        while counter < total_hashtags:
+            tag = hashtags[counter]
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir")
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file")
+            res = get_posts(settings, tag)
+            if res:
+                log = ( res[0], ( "posts", res[1] ) )
+                log_data.append(log)
+                data_methods.print_total(settings["post_ids"], tag, download_data_type)
+
+            counter += 1
+            if counter < total_hashtags:
+                time.sleep(settings["sleep"])
+    elif download_data_type == "videos":
+        settings = set_download_settings(download_data_type)
+        while counter < total_hashtags:
+            tag = hashtags[counter]
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir")
+            settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}"
+            settings["videos_to"] = settings['data'] + f"/{tag}/videos"
+            res = get_videos(settings, tag)
+            if res:
+                res = ( res[0], ( "videos", res[1]))
+                log_data.append(res)
+                data_methods.print_total(settings["video_ids"], tag, download_data_type)
+
+            counter += 1
+            if counter < total_hashtags:
+                time.sleep(settings["sleep"])
+    elif download_data_type == "posts-videos":
+        settings = set_download_settings(download_data_type)
+        while counter < total_hashtags:
+            tag = hashtags[counter]
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"]), "dir")
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["posts"], settings["data_file"]), "file")
+            file_methods.check_file(os.path.join(settings["data"], tag, settings["videos"]), "dir")
+            settings["videos_delete"] = settings['data'] + f"/{tag}/videos/#{tag}"
+            settings["videos_to"] = settings['data'] + f"/{tag}/videos"
+            requests = [("posts", "post_ids", get_posts), ("videos", "video_ids", get_videos)]
+            total_reqs_offset = len(requests) - 1
+            req_counter = 0
+            for req in requests:
+                res = req[2](settings, tag)
+                if res:
+                    res = ( res[0], (req[0], res[1]) )
+                    log_data.append(res)
+                    data_methods.print_total(settings[req[1]], tag, req[0])
+
+                if req_counter < total_reqs_offset:
+                    time.sleep(settings["sleep"])
+                    req_counter += 1
+
+            counter += 1
+            if counter < total_hashtags:
+                time.sleep(settings["sleep"])
+    else:
+        print(f"ERROR: The download_data_type must be either posts, videos or posts-videos.")
+        sys.exit()
+    return log_data
+
+
+def get_hashtags(file_name, hashtag_list):
+    try:
+        from hashtag_list import hashtag_list
+        return hashtag_list
+    except:
+        print(f"ERROR: something went wrong while reading the file {file_name}!")
+        raise
+
+
+if __name__ == "__main__":
+    parser = create_parser()
+    args = parser.parse_args()
+
+    if not (args.p or args.v):
+        parser.error("No argument given, please specify either -p for posts or -v videos or both.")
+        sys.exit()
+    
+    if args.h:
+        hashtags = args.h
+    else:
+        hashtags = get_hashtags("hashtag_list", "hashtag_list")
+
+    print(hashtags)
+    if not hashtags:
+        hashtags = get_hashtag_list()
+        if not hashtags:
+            print(f"ERROR: No hashtags found. Please re-run the script with at least one hashtag!!!")
+            sys.exit(0)
+
+    if (args.p and args.v):
+        download_data_type = "posts-videos"
+    elif args.p:
+        download_data_type = "posts"
+    else:
+        download_data_type = "videos"
+   
+    try: 
+        log_data = get_data(hashtags, download_data_type)
+        if log_data:
+            file_methods.log_writer(log_data)
+    except:
+        raise

From d6cb771f607f7e5392c96994fcdf33da50246d92 Mon Sep 17 00:00:00 2001
From: X <work@Xs-MacBook-Pro.local>
Date: Sun, 30 Jan 2022 13:59:43 +0100
Subject: [PATCH 22/23] delete old files

---
 data_processor.sh | 58 -----------------------------------------------
 1 file changed, 58 deletions(-)
 delete mode 100644 data_processor.sh

diff --git a/data_processor.sh b/data_processor.sh
deleted file mode 100644
index e0b4f66..0000000
--- a/data_processor.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-counter=0
-
-function join_lines {
-    local IFS="$1"
-    shift
-    echo "$*"
-}
-
-
-while IFS= read -r line || [ -n "$line" ]; do
-    if [[ -z ${line} ]];
-    then
-        :
-    elif [[ ${line: -1}  != '"' ]]; 
-    then
-	    to_combine[$counter]=$line
-        let "counter=counter+1"
-    elif [[ ${line: 0} != '"' && ${line: -1} == '"' ]];
-    then
-        to_combine[$counter]=$line
-        joined=$(join_lines " " "${to_combine[@]}")
-        #joined=$(join_lines " " "${to_combine[@]}" | tr -d "\n") # Mac sometimes introduces new lines, tr is used to remove newlines from joined.
-        echo "$joined" >> tmp.csv
-        unset to_combine
-        let "counter=0"
-    else
-        echo "$line" >> tmp.csv
-    fi
-done < "$1"
-
-while IFS= read -r line || [ -n "$line" ]; do
-    if [[ ${line: 0} == '"' ]];
-    then
-        if [[ -f anomalies_$1 ]];
-        then
-            echo "${line}" >> anomalies_$1
-        else
-            touch anomalies_$1
-            echo "${line}" >> anomalies_$1
-        fi
-    else
-        echo "${line}" >> clean-data_$1
-    fi
-done < tmp.csv
-
-rm -f tmp.csv
-
-if [[ ( -f "anomalies.csv") && ($(tr -d '\n\r\t' < anomalies.csv | wc -c) -eq 0) ]]; 
-then 
-    anmls=$(wc -l anomalies_$1 | awk '{print $1}')
-    echo "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in anomalies_$1."
-else
-    input_data=$(wc -l $1 | awk '{print $1}')
-    clean_lines=$(wc -l clean-data_$1 | awk '{print $1}')
-    echo "${clean_lines} lines of clean data out of ${input_data} is recorded in clean-data_$1."
-fi

From bfa90676f121dd88e070dc134791a596a104e784 Mon Sep 17 00:00:00 2001
From: X <work@Xs-MacBook-Pro.local>
Date: Sun, 30 Jan 2022 14:00:37 +0100
Subject: [PATCH 23/23] delete old files

---
 extract_date.py           | 56 -------------------------
 extract_hashtag.py        | 87 ---------------------------------------
 extract_posts.py          | 49 ----------------------
 top_hashtag_occurances.py | 83 -------------------------------------
 4 files changed, 275 deletions(-)
 delete mode 100644 extract_date.py
 delete mode 100644 extract_hashtag.py
 delete mode 100644 extract_posts.py
 delete mode 100644 top_hashtag_occurances.py

diff --git a/extract_date.py b/extract_date.py
deleted file mode 100644
index 788cba7..0000000
--- a/extract_date.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import datetime
-import collections
-import matplotlib.pyplot as plt
-import matplotlib.dates as mdates
-
-
-if len(sys.argv) < 3:
-    print(f'ERROR: Please make sure the command line has the following format: python3 extract_date.py hashtag_data.json hashtag')
-    sys.exit()
-
-
-def list_to_frequency(li):
-    if li and (type(li) == list):
-        return collections.Counter(li)
-    else:
-        print(f"ERROR: either {li} is empty or not a list.")
-
-
-def eligibility_check(obj):
-    if not obj:
-        print(f'ERROR: {obj} is empty!')
-        return False
-    elif type(obj) != int:
-        print(f'ERROR: {obj} is not an integer as is expected!')
-        return False
-    else:
-        return True
-
-with open(sys.argv[1]) as file:
-    object = json.load(file)
-    l = len(object)
-    date_list = []
-    for i in range(0, l):
-        obj = object[i]["createTime"]
-        if eligibility_check(obj):
-            dt_obj = datetime.datetime.fromtimestamp(obj)
-            date_list.append(dt_obj.date())
-        else:
-            print(f'ERROR: Some error occured. Check {obj}.')
-    ordered = dict(list_to_frequency(date_list))
-    dates = list(ordered.keys())
-    total_dates = len(dates)
-    frequency = list(ordered.values())
-    plt.scatter(dates, frequency)
-    plt.gcf().autofmt_xdate()
-    date_format = mdates.DateFormatter('%d-%m-%Y')
-    plt.gca().xaxis.set_major_formatter(date_format)
-    plt.tight_layout()
-    plt.title(f'Hashtag Lifecyle - #{sys.argv[2]}')
-    plt.xlabel(f'Dates ({total_dates} dates out of {l} posts)')
-    plt.ylabel('Posts')
-    plt.show()
diff --git a/extract_hashtag.py b/extract_hashtag.py
deleted file mode 100644
index 0b9e1fc..0000000
--- a/extract_hashtag.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os, sys
-import csv, json
-import argparse
-import matplotlib.pyplot as plt
-
-
-
-def get_hashtags(obj):
-    if not obj:
-        print(f'ERROR: Empty item, no hashtags to be extracted.')
-        return
-    else:
-        hashtags = {}
-        l = len(obj)
-        for i in range(l):
-            for hashtag in obj[i]['hashtags']:
-                if hashtag['name'] in hashtags:
-                    hashtags[hashtag['name']].add(i)
-                else:
-                    hashtags[hashtag['name']] = {i}
-    return hashtags
-
-
-def get_occurances(filename, n=1 , sort=True):
-    with open(filename) as f:
-        obj = json.load(f)
-        l = len(obj)
-        tags = get_hashtags(obj)
-        tags = {key: (len(value), value) for (key, value) in tags.items()}
-        if not sort:
-            k = list(tags.keys())
-            v = list(tags.values())
-            return obj, k, v 
-        else:
-            sorted_tags = {k: v for k,v in sorted(tags.items(), key=lambda item: item[1], reverse=True)}
-            k = list(sorted_tags.keys())
-            v = list(sorted_tags.values())
-            k = k[:n]
-            v_total = [i[0] for i in v]
-            v_total = v_total[:n]
-            return l, k, v_total
-
-
-
-def plot(n, length, k, v):
-    plt.scatter(k, v)
-    plt.tight_layout()
-    plt.title(f'Hashtag Distribution')
-    plt.xlabel(f'Top {n} hashtags from {length} posts.')
-    plt.ylabel(f'Number of occurances')
-    plt.show()
-    return
-
-
-def print_occurances(k, v):
-    row_number = 0
-    print(f'Hashtag  Occurances')
-    for key,value in zip(k, v):
-        print(f'{row_number}\t{key}\t\t{value}')
-        row_number += 1
-    return
-
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("input_file", help="The json hashtag file name")
-    parser.add_argument("n", help="The number of top n occurances", type=int)
-    parser.add_argument("-p", "--plot", help="Plot the occurances", action="store_true")
-    parser.add_argument("-d", "--print", help="List top n hashtags", action="store_true")
-    args = parser.parse_args()
-    if args.input_file and args.n:
-        if args.n < 1:
-            print(f"Please make sure the number of top occurances is a positive integer.")
-            sys.exit()
-
-        base = os.path.splitext(args.input_file)[0]
-        path = f"./{base}_sorted_hashtags.csv"
-        if args.plot:
-            length, keys, values = get_occurances(args.input_file, args.n)
-            plot(args.n, length, keys, values)
-        else:
-            length, keys, values = get_occurances(args.input_file, args.n)
-            print_occurances(keys, values)
-    else:
-        print(f'ERROR: either {args.input_file} or {args.n} or both contains error.')
-            
diff --git a/extract_posts.py b/extract_posts.py
deleted file mode 100644
index 721393b..0000000
--- a/extract_posts.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os, sys
-from extract_hashtag import get_occurances
-
-
-def filter_positions(hashtags, keys, positions):
-    filtered = []
-    for hashtag in hashtags:
-        try: 
-            i = keys.index(hashtag)
-            key = keys[i]
-            post_indices = positions[i][1]
-            filtered.append((key, post_indices))
-        except Exception as error:
-            print(error)
-            continue
-    return filtered
-
-
-def write_posts(path, obj, filtered):
-    length = len(filtered)
-    with open(path, "w") as output_file:
-        for i in range(length):
-            hashtag = filtered[i][0]
-            total_positions = len(filtered[i][1])
-            positions = list(filtered[i][1])
-            first_position = positions[0]
-            output_file.write(f"{hashtag}, {obj[first_position]}" + "\n")
-            for p in range(1, total_positions):
-                output_file.write(f" , {obj[positions[p]]}" + "\n")
-            print(f"{total_positions} posts written for the hashtag - {hashtag}")
-
-
-if __name__ == "__main__":
-    file_name = sys.argv[1]
-    hashtags = list(sys.argv[2:])
-    name = f"{hashtags[0]}_{len(hashtags)}"
-    path = f"../{name}_posts.csv"
-    if os.path.exists(path):
-        print(f'The file {path} containing hashtag occurances already exists. If you would like to run the script afresh, please delete the file {path} and re-run the script.')
-        sys.exit()
-    else:
-        obj, keys, positions = get_occurances(file_name, sort=False)
-        filtered = filter_positions(hashtags, keys, positions)
-        if filtered:
-            write_posts(path, obj, filtered)
-        else:
-            print(f"No posts found for the hashtags you entered.")
-        
-
diff --git a/top_hashtag_occurances.py b/top_hashtag_occurances.py
deleted file mode 100644
index dcee5aa..0000000
--- a/top_hashtag_occurances.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/python3
-
-import os, time
-import json
-import argparse
-from datetime import datetime
-
-
-def parser():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("hashtags", help="The hashtags to be processed", nargs="+")
-    parser.add_argument("top_n", help="Top n occurances for a hashtag", type=int)
-    args = parser.parse_args()
-    return args
-
-
-def check_file_existence(hashtag, contains=None):
-    pwd = "./"
-    for i in os.listdir(pwd):
-        #if os.path.isfile(os.path.join(pwd, i)) and hashtag in i:
-        if hashtag in i and contains in i:
-            return i
-        elif hashtag in i:
-            return i
-        else:
-            continue
-    return
-
-
-def get_input_file(hashtag):
-    check_file = check_file_existence(hashtag, "json")
-    if check_file:
-        return check_file
-    else:
-        try: 
-            os.system(f"tiktok-scraper hashtag {hashtag} -t json")
-            c = check_file_existence(hashtag, "json")
-            if c:
-                return c
-            else:
-                print(f"ERROR: No json file relating to {hashtag} found.")
-        except:
-            raise
-
-
-def copy_data(input_file, output_file):
-    os.system(f"cat {input_file} >> {output_file} && echo >> {output_file}")
-    return
-
-
-def get_data(hashtag, n):
-    input_file = get_input_file(hashtag)
-    if input_file:
-        os.system(f"python3 extract_hashtag.py {input_file} {str(n)} -o")
-        base = os.path.splitext(input_file)[0]
-        data_file = f"{base}_sorted_hashtags.csv"
-        if os.path.exists(data_file):
-            return data_file
-    return 
-
-
-def get_occurances(hashtag, n, output):
-    data_file = get_data(hashtag, n)
-    copy_data(data_file, output)
-    os.system(f"rm {data_file}")
-    print(f"{data_file} removed ....")
-
-
-if __name__ == "__main__":
-    args = parser()
-    hashtags = args.hashtags
-    now = datetime.now().strftime("%d%m%Y-%H%M%S")
-    output = f"./{now}.csv"
-    l = len(hashtags)
-    if l > 1:
-        sleep = 30 # Sleep time (in secs) between two tiktok scraping requests.
-        get_occurances(hashtags[0], args.top_n, output)
-        for i in range(1, l):
-            time.sleep(30)
-            get_occurances(hashtags[i], args.top_n, output)
-    else:
-        get_occurances(hashtags[0], args.top_n, output)
-    print(f"The output data is stored in the file {output}")