From 5c5833421e22e1c03b96b0cb99269b2c29d75f6a Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 20 Jul 2021 22:18:15 +0200
Subject: [PATCH] Add files via upload

---
 data_processor.sh | 58 +++++++++++++++++++++++++++++++++++++++++++++++
 extract_date.py   | 56 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 data_processor.sh
 create mode 100644 extract_date.py

diff --git a/data_processor.sh b/data_processor.sh
new file mode 100644
index 0000000..e0b4f66
--- /dev/null
+++ b/data_processor.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#
+# Repair CSV records that were split across several physical lines, then sort
+# the repaired records into clean rows and anomalies.
+#
+# Usage: ./data_processor.sh input.csv
+#
+# Non-empty lines are buffered until one ends with a double quote; the buffered
+# pieces are then joined with single spaces into one record. Records that start
+# with a double quote are appended to anomalies_<name>, all others to
+# clean-data_<name>, where <name> is the input's file name. Both files are
+# created in the current directory and are appended to if they already exist.
+
+if [[ $# -lt 1 || ! -r $1 ]]; then
+    echo "Usage: $0 input.csv (the file must exist and be readable)" >&2
+    exit 1
+fi
+
+input=$1
+name=${input##*/}   # strip any directory part so the output names stay valid
+anomalies="anomalies_${name}"
+clean="clean-data_${name}"
+
+# A private temp file cannot be polluted by a tmp.csv left over from an aborted run.
+tmp=$(mktemp) || exit 1
+trap 'rm -f "$tmp"' EXIT
+
+to_combine=()
+
+# Print all arguments after the first, joined by the first argument.
+function join_lines {
+    local IFS="$1"
+    shift
+    printf '%s\n' "$*"   # printf, unlike echo, never treats the data as options
+}
+
+# Print the number of lines in file $1, or 0 when the file does not exist.
+function count_lines {
+    if [[ -f $1 ]]; then
+        wc -l < "$1" | tr -d '[:space:]'
+    else
+        echo 0
+    fi
+}
+
+# Pass 1: stitch broken records back together.
+# "|| [ -n ... ]" keeps a final line that has no trailing newline.
+while IFS= read -r line || [ -n "$line" ]; do
+    if [[ -z ${line} ]]; then
+        :   # blank lines carry no data
+    elif [[ ${line: -1} != '"' ]]; then
+        to_combine+=("$line")   # record is still open; keep buffering
+    elif [[ ${line:0:1} != '"' ]]; then
+        # Closing piece: emit everything buffered so far as a single record.
+        # ${line:0:1} is the first character; ${line: 0} would be the whole line.
+        to_combine+=("$line")
+        join_lines " " "${to_combine[@]}" >> "$tmp"
+        to_combine=()
+    else
+        # Starts and ends with a quote: pass through untouched for pass 2 to flag.
+        printf '%s\n' "$line" >> "$tmp"
+    fi
+done < "$input"
+
+# Pass 2: records starting with a double quote are anomalies, the rest are clean.
+while IFS= read -r line || [ -n "$line" ]; do
+    if [[ ${line:0:1} == '"' ]]; then
+        printf '%s\n' "$line" >> "$anomalies"
+    else
+        printf '%s\n' "$line" >> "$clean"
+    fi
+done < "$tmp"
+
+# A record that was never closed would otherwise be dropped without a trace.
+if [[ ${#to_combine[@]} -gt 0 ]]; then
+    echo "WARNING: input ended inside an unterminated record; it is recorded as an anomaly." >&2
+    join_lines " " "${to_combine[@]}" >> "$anomalies"
+fi
+
+if [[ -s $anomalies ]]; then
+    echo "Anomalies found!!!!! $(count_lines "$anomalies") lines of anomalies are recorded in ${anomalies}."
+else
+    echo "$(count_lines "$clean") lines of clean data out of $(count_lines "$input") is recorded in ${clean}."
+fi
diff --git a/extract_date.py b/extract_date.py
new file mode 100644
index 0000000..788cba7
--- /dev/null
+++ b/extract_date.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Plot how many posts of a hashtag were created on each calendar date.
+
+Usage: python3 extract_date.py hashtag_data.json hashtag
+
+The JSON file is expected to hold a list of post objects. Each post's
+"createTime" is read as an integer Unix timestamp and converted to a date in
+the local time zone; posts without a usable timestamp are reported and skipped.
+"""
+
+import sys
+import json
+import datetime
+import collections
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+
+
+USAGE = ('ERROR: Please make sure the command line has the following format: '
+         'python3 extract_date.py hashtag_data.json hashtag')
+
+
+def list_to_frequency(li):
+    """Count how often each item occurs in ``li``.
+
+    Returns a ``collections.Counter``. If ``li`` is empty or not a list, an
+    error is printed and an empty Counter is returned, so the result can
+    always be used as a mapping.
+    """
+    if li and isinstance(li, list):
+        return collections.Counter(li)
+    print(f"ERROR: either {li} is empty or not a list.")
+    return collections.Counter()
+
+
+def eligibility_check(obj):
+    """Return True if ``obj`` is a non-zero ``int``; print an error otherwise."""
+    if not obj:
+        print(f'ERROR: {obj} is empty!')
+        return False
+    # bool is a subclass of int but is not a timestamp, so reject it explicitly.
+    if isinstance(obj, bool) or not isinstance(obj, int):
+        print(f'ERROR: {obj} is not an integer as is expected!')
+        return False
+    return True
+
+
+def extract_dates(posts):
+    """Return the local creation date of every post with a usable "createTime".
+
+    Posts that are not objects, lack the key, or carry a timestamp the platform
+    cannot convert are reported and skipped instead of aborting the run.
+    """
+    date_list = []
+    for post in posts:
+        created = post.get("createTime") if isinstance(post, dict) else None
+        if not eligibility_check(created):
+            print(f'ERROR: Some error occured. Check {created}.')
+            continue
+        try:
+            date_list.append(datetime.datetime.fromtimestamp(created).date())
+        except (OverflowError, OSError, ValueError):
+            # fromtimestamp() raises for values outside the supported range.
+            print(f'ERROR: Some error occured. Check {created}.')
+    return date_list
+
+
+def plot_frequency(frequency, hashtag, post_count):
+    """Show a scatter plot of posts per date.
+
+    ``frequency`` maps ``datetime.date`` to a post count, ``hashtag`` is used
+    in the title, and ``post_count`` is the total number of posts in the input.
+    """
+    dates = list(frequency)
+    plt.scatter(dates, list(frequency.values()))
+    plt.gcf().autofmt_xdate()
+    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%d-%m-%Y'))
+    plt.title(f'Hashtag Lifecyle - #{hashtag}')
+    plt.xlabel(f'Dates ({len(dates)} dates out of {post_count} posts)')
+    plt.ylabel('Posts')
+    # tight_layout() is computed once from the artists present, so call it last.
+    plt.tight_layout()
+    plt.show()
+
+
+def main(argv):
+    """Run the script with ``argv`` (``sys.argv`` layout); return the exit status."""
+    if len(argv) < 3:
+        print(USAGE, file=sys.stderr)
+        return 1
+    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
+    with open(argv[1], encoding='utf-8') as file:
+        posts = json.load(file)
+    if not isinstance(posts, list):
+        print('ERROR: the JSON file must contain a list of posts.',
+              file=sys.stderr)
+        return 1
+    # Sort by date so the mapping is in chronological order.
+    frequency = dict(sorted(list_to_frequency(extract_dates(posts)).items()))
+    if not frequency:
+        print('ERROR: no post has a usable "createTime"; nothing to plot.',
+              file=sys.stderr)
+        return 1
+    plot_frequency(frequency, argv[2], len(posts))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))