From 729c2e49bf0330adcd374553be31a80955cac1ee Mon Sep 17 00:00:00 2001
From: Miguel Sozinho Ramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 30 Jun 2022 12:13:02 +0100
Subject: [PATCH] feat: fixes after testing with noisier data (#2)

* print a warning to console when invalid coordinates are present in dataset.
* rename cli to __main__ to allow calling as a python module
* move data validation to `io` module
---
Review notes (everything between the three-dash line above and the first
"diff --git" line is commentary; `git am` keeps it out of the commit message):

* geocluster/io.py, read_csv_file: `df.replace({np.nan: None})` returned a
  new frame that was thrown away, so the NaN -> None substitution never took
  effect. The result is now assigned back to `df`.
* geocluster/io.py, is_valid_lat / is_valid_lon: the bare `except:` also
  swallowed KeyboardInterrupt and SystemExit. Both helpers now catch only
  TypeError and ValueError, the errors float() raises for input it cannot
  convert, and use a chained comparison for the range check.
* The number of added and removed lines is unchanged, so the diffstat and the
  hunk headers below still hold. The post-image blob id on the io.py `index`
  line predates these edits; regenerate the patch to refresh it.
* NOTE(review): this patch was recovered from a copy whose line breaks and
  indentation had been collapsed. Leading whitespace and the position of
  blank context lines in the clustering.py and encoding.py hunks are a best
  guess; confirm against the tree before applying.

 geocluster/{cli.py => __main__.py} |  0
 geocluster/clustering.py           |  2 +-
 geocluster/encoding.py             |  4 ++--
 geocluster/io.py                   | 34 +++++++++++++++++++++++++++++-
 4 files changed, 36 insertions(+), 4 deletions(-)
 rename geocluster/{cli.py => __main__.py} (100%)

diff --git a/geocluster/cli.py b/geocluster/__main__.py
similarity index 100%
rename from geocluster/cli.py
rename to geocluster/__main__.py
diff --git a/geocluster/clustering.py b/geocluster/clustering.py
index 3771b6e..dd2b011 100644
--- a/geocluster/clustering.py
+++ b/geocluster/clustering.py
@@ -50,5 +50,5 @@ def cluster_locations(df, algorithm, radius_km, min_cluster_size):
         n_jobs=-1,
     )
 
-    X = np.radians(np.array(coordinates))
+    X = np.radians(np.array(coordinates).astype(float))
     return to_cluster_dict(df, clustering.fit(X))
diff --git a/geocluster/encoding.py b/geocluster/encoding.py
index c4c410e..d9da123 100644
--- a/geocluster/encoding.py
+++ b/geocluster/encoding.py
@@ -64,8 +64,8 @@ class GeoJSONEncoder:
             "cluster_id": cluster_id,
         }
 
-        lon = props.pop("lon")
-        lat = props.pop("lat")
+        lon = float(props.pop("lon"))
+        lat = float(props.pop("lat"))
 
         point = geojson.Point((lon, lat))
         self.state.append(geojson.Feature(geometry=point, properties=props))
diff --git a/geocluster/io.py b/geocluster/io.py
index 466a338..d680125 100644
--- a/geocluster/io.py
+++ b/geocluster/io.py
@@ -4,11 +4,43 @@ from pkg_resources import resource_filename
 import json
 import json
 import pandas as pd
+import numpy as np
+
+
+def is_valid_lat(val: str) -> bool:
+    """Return True if val parses as a decimal latitude within [-90, 90]."""
+    try:
+        val = float(val)
+        return -90 <= val <= 90
+    except (TypeError, ValueError):
+        return False
+
+
+def is_valid_lon(val: str) -> bool:
+    """Return True if val parses as a decimal longitude within [-180, 180]."""
+    try:
+        val = float(val)
+        return -180 <= val <= 180
+    except (TypeError, ValueError):
+        return False
 
 
 def read_csv_file(filename):
     """Read input csv file, dropping rows that don't have valid location data."""
-    return pd.read_csv(filename).dropna(subset=["lat", "lon"])
+    df = pd.read_csv(filename)
+    initial_rows = len(df)
+
+    df = df.dropna(subset=["lat", "lon"])
+    df = df.replace({np.nan: None})  # so NaN in other fields does not break kepler parsing
+    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")
+
+    valid_index = df.lat.astype(str).apply(is_valid_lat) & df.lon.astype(str).apply(
+        is_valid_lon
+    )
+    if len(df_invalid := df[~valid_index]):
+        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
+        print(df_invalid[["lat", "lon"]].to_string())
+    return df[valid_index]
 
 
 def ensure_file_path(dirname, filename):