mirror of
https://github.com/bellingcat/geoclustering.git
synced 2026-06-08 03:28:30 +03:00
feat: fixes after testing with noisier data (#2)
* Print a warning to the console when invalid coordinates are present in the dataset.
* Rename `cli` to `__main__` to allow calling the package as a Python module.
* Move data validation to the `io` module.
This commit is contained in:
committed by
Felix Spöttel
parent
50f8a872e6
commit
729c2e49bf
@@ -50,5 +50,5 @@ def cluster_locations(df, algorithm, radius_km, min_cluster_size):
|
|||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
)
|
)
|
||||||
|
|
||||||
X = np.radians(np.array(coordinates))
|
X = np.radians(np.array(coordinates).astype(float))
|
||||||
return to_cluster_dict(df, clustering.fit(X))
|
return to_cluster_dict(df, clustering.fit(X))
|
||||||
|
|||||||
@@ -64,8 +64,8 @@ class GeoJSONEncoder:
|
|||||||
"cluster_id": cluster_id,
|
"cluster_id": cluster_id,
|
||||||
}
|
}
|
||||||
|
|
||||||
lon = props.pop("lon")
|
lon = float(props.pop("lon"))
|
||||||
lat = props.pop("lat")
|
lat = float(props.pop("lat"))
|
||||||
|
|
||||||
point = geojson.Point((lon, lat))
|
point = geojson.Point((lon, lat))
|
||||||
self.state.append(geojson.Feature(geometry=point, properties=props))
|
self.state.append(geojson.Feature(geometry=point, properties=props))
|
||||||
|
|||||||
@@ -4,11 +4,43 @@ from pkg_resources import resource_filename
|
|||||||
import json
|
import json
|
||||||
import json
|
import json
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_lat(val: str) -> bool:
    """Check whether a value is a valid decimal latitude.

    Args:
        val: The value to check, normally the string form of a CSV cell.

    Returns:
        True if ``val`` can be converted with ``float()`` and the result lies
        in the closed interval [-90, 90]; False otherwise. Values that parse
        to NaN or infinity fail the range comparison and yield False.
    """
    try:
        lat = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a latitude". A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
    return -90 <= lat <= 90
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_lon(val: str) -> bool:
    """Check whether a value is a valid decimal longitude.

    Args:
        val: The value to check, normally the string form of a CSV cell.

    Returns:
        True if ``val`` can be converted with ``float()`` and the result lies
        in the closed interval [-180, 180]; False otherwise. Values that parse
        to NaN or infinity fail the range comparison and yield False.
    """
    try:
        lon = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a longitude". A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
    return -180 <= lon <= 180
|
||||||
|
|
||||||
|
|
||||||
def read_csv_file(filename):
    """Read input csv file, dropping rows that don't have valid location data.

    Rows are removed in two passes, each reported on stdout:

    1. rows whose ``lat`` or ``lon`` cell is missing (NaN);
    2. rows whose ``lat`` / ``lon`` values are rejected by ``is_valid_lat`` /
       ``is_valid_lon``. The offending pairs are printed.

    NaN cells that remain in the other columns are replaced with ``None``.

    Args:
        filename: Anything accepted by ``pandas.read_csv``. The data must
            contain ``lat`` and ``lon`` columns.

    Returns:
        A ``pandas.DataFrame`` containing only the rows with valid coordinates.
    """
    df = pd.read_csv(filename)
    initial_rows = len(df)

    df = df.dropna(subset=["lat", "lon"])
    # replace for other fields not to break kepler parsing.
    # DataFrame.replace returns a new frame, so the result must be assigned;
    # calling it without assignment leaves the NaN values in place.
    df = df.replace({np.nan: None})
    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")

    # Validate on the string form so that numeric and text cells go through
    # the same check.
    lat_ok = df.lat.astype(str).apply(is_valid_lat)
    lon_ok = df.lon.astype(str).apply(is_valid_lon)
    valid_index = lat_ok & lon_ok

    df_invalid = df[~valid_index]
    if len(df_invalid):
        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
        print(df_invalid[["lat", "lon"]].to_string())

    return df[valid_index]
|
||||||
|
|
||||||
|
|
||||||
def ensure_file_path(dirname, filename):
|
def ensure_file_path(dirname, filename):
|
||||||
|
|||||||
Reference in New Issue
Block a user