diff --git a/geocluster/cli.py b/geocluster/__main__.py
similarity index 100%
rename from geocluster/cli.py
rename to geocluster/__main__.py
diff --git a/geocluster/clustering.py b/geocluster/clustering.py
index 3771b6e..dd2b011 100644
--- a/geocluster/clustering.py
+++ b/geocluster/clustering.py
@@ -50,5 +50,5 @@ def cluster_locations(df, algorithm, radius_km, min_cluster_size):
         n_jobs=-1,
     )
 
-    X = np.radians(np.array(coordinates))
+    X = np.radians(np.array(coordinates).astype(float))
     return to_cluster_dict(df, clustering.fit(X))
diff --git a/geocluster/encoding.py b/geocluster/encoding.py
index c4c410e..d9da123 100644
--- a/geocluster/encoding.py
+++ b/geocluster/encoding.py
@@ -64,8 +64,8 @@ class GeoJSONEncoder:
             "cluster_id": cluster_id,
         }
 
-        lon = props.pop("lon")
-        lat = props.pop("lat")
+        lon = float(props.pop("lon"))
+        lat = float(props.pop("lat"))
 
         point = geojson.Point((lon, lat))
         self.state.append(geojson.Feature(geometry=point, properties=props))
diff --git a/geocluster/io.py b/geocluster/io.py
index 466a338..d680125 100644
--- a/geocluster/io.py
+++ b/geocluster/io.py
@@ -4,11 +4,49 @@ from pkg_resources import resource_filename
 import json
 import json
 import pandas as pd
+import numpy as np
+
+
+def is_valid_lat(val: str) -> bool:
+    """Given a string, check if it corresponds to a valid decimal latitude value"""
+    try:
+        val = float(val)
+    except (TypeError, ValueError):
+        # Not parseable as a number, so it cannot be a latitude.
+        return False
+    # NaN fails both comparisons, so it is rejected as well.
+    return -90 <= val <= 90
+
+
+def is_valid_lon(val: str) -> bool:
+    """Given a string, check if it corresponds to a valid decimal longitude value"""
+    try:
+        val = float(val)
+    except (TypeError, ValueError):
+        # Not parseable as a number, so it cannot be a longitude.
+        return False
+    # NaN fails both comparisons, so it is rejected as well.
+    return -180 <= val <= 180
 
 
 def read_csv_file(filename):
     """Read input csv file, dropping rows that don't have valid location data."""
-    return pd.read_csv(filename).dropna(subset=["lat", "lon"])
+    df = pd.read_csv(filename)
+    initial_rows = len(df)
+
+    df = df.dropna(subset=["lat", "lon"])
+    # Replace NaN in the other fields so it does not break kepler parsing.
+    # replace() returns a new frame, so the result must be assigned back.
+    df = df.replace({np.nan: None})
+    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")
+
+    valid_index = df.lat.astype(str).apply(is_valid_lat) & df.lon.astype(str).apply(
+        is_valid_lon
+    )
+    if len(df_invalid := df[~valid_index]):
+        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
+        print(df_invalid[["lat", "lon"]].to_string())
+    return df[valid_index]
 
 
 def ensure_file_path(dirname, filename):