From 5c5833421e22e1c03b96b0cb99269b2c29d75f6a Mon Sep 17 00:00:00 2001
From: jowi-tech <72805812+jowi-tech@users.noreply.github.com>
Date: Tue, 20 Jul 2021 22:18:15 +0200
Subject: [PATCH] Add files via upload

---
Note: the blob ids on the index lines below are those of the original upload.

 data_processor.sh | 80 +++++++++++++++++++++++++++++++++++++++++
 extract_date.py   | 90 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 data_processor.sh
 create mode 100644 extract_date.py

diff --git a/data_processor.sh b/data_processor.sh
new file mode 100644
index 0000000..e0b4f66
--- /dev/null
+++ b/data_processor.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# Repair a CSV export whose records were split across several lines.
+#
+# Usage: ./data_processor.sh input.csv
+#
+# A record is complete when its line ends with a double quote. Lines
+# without that closing quote are buffered and joined, separated by single
+# spaces, onto the next line that has one. Repaired lines that begin with
+# a double quote are appended to anomalies_INPUT; all other lines are
+# appended to clean-data_INPUT.
+
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 input.csv" >&2
+  exit 1
+fi
+
+# Join every argument after the first, using the first as the separator.
+function join_lines {
+  local IFS="$1"
+  shift
+  printf '%s\n' "$*"
+}
+
+# Start from an empty scratch file: a stale tmp.csv must not leak into this
+# run, and the second pass needs the file to exist even for empty input.
+: > tmp.csv
+to_combine=()
+
+# Pass 1: stitch split records back together into tmp.csv.
+while IFS= read -r line || [ -n "$line" ]; do
+  if [[ -z ${line} ]]; then
+    # Blank lines carry no data.
+    :
+  elif [[ ${line: -1} != '"' ]]; then
+    # No closing quote yet: the record continues on a later line.
+    to_combine+=("$line")
+  elif [[ ${line:0:1} != '"' ]]; then
+    # Closing quote reached: emit the buffered pieces as one record.
+    # ${line:0:1} is the first character; ${line: 0} would be the whole line.
+    # NOTE: macOS sometimes introduced newlines into the joined record;
+    # stripping them with `tr -d "\n"` was the author's workaround.
+    to_combine+=("$line")
+    join_lines " " "${to_combine[@]}" >> tmp.csv
+    to_combine=()
+  else
+    # Starts and ends with a quote: pass it through for pass 2 to flag.
+    printf '%s\n' "$line" >> tmp.csv
+  fi
+done < "$1"
+
+# Fragments still buffered here never saw a closing quote; say so instead
+# of dropping them silently.
+if [[ ${#to_combine[@]} -gt 0 ]]; then
+  echo "WARNING: ${#to_combine[@]} trailing line(s) had no closing quote and were skipped." >&2
+fi
+
+# Pass 2: route each repaired line to the anomalies or clean-data file.
+while IFS= read -r line || [ -n "$line" ]; do
+  if [[ ${line:0:1} == '"' ]]; then
+    printf '%s\n' "${line}" >> "anomalies_$1"
+  else
+    printf '%s\n' "${line}" >> "clean-data_$1"
+  fi
+done < tmp.csv
+
+rm -f tmp.csv
+
+# -s is true only when the anomalies file exists and is not empty.
+if [[ -s "anomalies_$1" ]]; then
+  anmls=$(wc -l "anomalies_$1" | awk '{print $1}')
+  echo "Anomalies found!!!!! ${anmls} lines of anomalies are recorded in anomalies_$1."
+else
+  input_data=$(wc -l "$1" | awk '{print $1}')
+  clean_lines=0
+  if [[ -f "clean-data_$1" ]]; then
+    clean_lines=$(wc -l "clean-data_$1" | awk '{print $1}')
+  fi
+  echo "${clean_lines} lines of clean data out of ${input_data} is recorded in clean-data_$1."
+fi
diff --git a/extract_date.py b/extract_date.py
new file mode 100644
index 0000000..788cba7
--- /dev/null
+++ b/extract_date.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""Plot how many posts using a hashtag were created on each calendar day.
+
+Usage: python3 extract_date.py hashtag_data.json hashtag
+
+The JSON file is read as a list of post objects.  Each post's "createTime"
+is expected to be an integer and is converted to a local date with
+datetime.datetime.fromtimestamp.
+"""
+
+import sys
+import json
+import datetime
+import collections
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+
+
+def list_to_frequency(li):
+    """Count how often each item occurs in the list *li*.
+
+    Returns a collections.Counter.  When *li* is empty or not a list, an
+    error is printed and an empty Counter is returned so the result can
+    always be used as a mapping.
+    """
+    if isinstance(li, list) and li:
+        return collections.Counter(li)
+    print(f"ERROR: either {li} is empty or not a list.")
+    return collections.Counter()
+
+
+def eligibility_check(obj):
+    """Return True when *obj* is an integer; otherwise print why not."""
+    # bool is a subclass of int but is not a usable timestamp.
+    if isinstance(obj, int) and not isinstance(obj, bool):
+        return True
+    if not obj:
+        print(f'ERROR: {obj} is empty!')
+    else:
+        print(f'ERROR: {obj} is not an integer as is expected!')
+    return False
+
+
+def main():
+    """Read the posts file named on the command line and show the plot."""
+    if len(sys.argv) < 3:
+        print('ERROR: Please make sure the command line has the following '
+              'format: python3 extract_date.py hashtag_data.json hashtag')
+        sys.exit(1)
+
+    # Close the file as soon as the JSON is parsed.
+    with open(sys.argv[1], encoding='utf-8') as file:
+        posts = json.load(file)
+
+    date_list = []
+    for post in posts:
+        # A post without the field is reported and skipped, not fatal.
+        created = post.get("createTime") if isinstance(post, dict) else None
+        if not eligibility_check(created):
+            print(f'ERROR: Some error occurred. Check {created}.')
+            continue
+        try:
+            dt_obj = datetime.datetime.fromtimestamp(created)
+        except (OverflowError, OSError, ValueError):
+            # Raised for integers outside the platform's timestamp range.
+            print(f'ERROR: Some error occurred. Check {created}.')
+            continue
+        date_list.append(dt_obj.date())
+
+    frequency_by_date = list_to_frequency(date_list)
+    if not frequency_by_date:
+        # Nothing usable to plot; the reason was printed above.
+        sys.exit(1)
+
+    dates = list(frequency_by_date.keys())
+    frequency = list(frequency_by_date.values())
+    plt.scatter(dates, frequency)
+    plt.gcf().autofmt_xdate()
+    date_format = mdates.DateFormatter('%d-%m-%Y')
+    plt.gca().xaxis.set_major_formatter(date_format)
+    plt.title(f'Hashtag Lifecycle - #{sys.argv[2]}')
+    plt.xlabel(f'Dates ({len(dates)} dates out of {len(posts)} posts)')
+    plt.ylabel('Posts')
+    # Lay out last so the title and axis labels are taken into account.
+    plt.tight_layout()
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()