migrate gh artifact actions to v4 (#20 )

* migrate gh artifact actions to v4 from migration guide no breaking changes apply here. * updates pipfile.lock dependency versions * updates CI due to pytest issue see https://github.com/scipy/scipy/issues/22236 * bump to python 3.12 * revert to py3.10
Bump version to v0.4.1 for release
2026-06-09 20:18:30 +03:00 · 2025-01-09 15:47:27 +00:00 · 2022-09-27 14:49:17 +01:00 · 2022-09-27 14:49:04 +01:00 · 2022-09-27 14:43:05 +01:00 · 2022-09-27 14:41:48 +01:00
33 changed files with 4125 additions and 1241 deletions
--- a/.github/actions/setup-venv/action.yml
+++ b/.github/actions/setup-venv/action.yml
@@ -0,0 +1,53 @@
+# Composite action: installs the requested Python, then creates (or restores
+# from cache) a virtualenv in `.venv` with the project installed in editable
+# mode together with its `dev` extras.
+name: Python virtualenv
+description: Set up a Python virtual environment with caching
+inputs:
+  python-version:
+    description: The Python version to use
+    required: true
+  cache-prefix:
+    description: Update this to invalidate the cache
+    required: true
+    default: v0
+runs:
+  using: composite
+  steps:
+    - name: Setup Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ inputs.python-version }}
+
+    - name: Install prerequisites
+      shell: bash
+      run: |
+        # Install prerequisites.
+        pip install --upgrade pip setuptools wheel virtualenv
+
+    - name: Record exact Python version
+      shell: bash
+      run: |
+        # Get the exact Python version to use in the cache key.
+        echo "PYTHON_VERSION=$(python --version)" >> "$GITHUB_ENV"
+
+    # The key combines the manual prefix, the runner OS, the exact interpreter
+    # version and the lock file hash, so a change to any of them yields a
+    # fresh virtualenv instead of a stale cached one.
+    - name: Restore virtualenv cache
+      uses: actions/cache@v4
+      id: virtualenv-cache
+      with:
+        path: .venv
+        key: ${{ inputs.cache-prefix }}-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('Pipfile.lock') }}
+
+    - name: Create virtualenv (cache miss)
+      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        # Set up virtual environment without cache hit.
+        test -d .venv || virtualenv -p "$(which python)" --copies --reset-app-data .venv
+        . .venv/bin/activate
+        pip install -e ".[dev]"
+
+    - name: Reuse virtualenv (cache hit)
+      if: steps.virtualenv-cache.outputs.cache-hit == 'true'
+      shell: bash
+      run: |
+        # Set up virtual environment from cache hit.
+        # Dependencies are already in the cache; only re-link the project.
+        . .venv/bin/activate
+        pip install --no-deps -e ".[dev]"
+
+    - name: Show environment info
+      shell: bash
+      run: |
+        # Show environment info.
+        . .venv/bin/activate
+        echo "✓ Installed $(python --version) virtual environment to $(which python)"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,10 +0,0 @@
-name: Lint
-
-on: [push]
-
-jobs:
-  black:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: psf/black@stable
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -0,0 +1,119 @@
+name: Main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# on: [push]
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+    tags:
+      - "v*.*.*"
+
+env:
+  # Change this to invalidate existing cache.
+  CACHE_PREFIX: v0
+  PYTHONPATH: ./
+
+jobs:
+  # Runs one job per matrix entry. Each entry pairs a Python version with a
+  # task; the task carries a display name and the shell snippet to execute.
+  checks:
+    name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
+    runs-on: [ubuntu-latest]
+    timeout-minutes: 15
+    strategy:
+      # Let every matrix entry run to completion even if another one fails.
+      fail-fast: false
+      matrix:
+        include:
+          - python: "3.10"
+            task:
+              name: "Build"
+              run: |
+                python setup.py check
+                python setup.py bdist_wheel sdist
+          - python: "3.10"
+            task:
+              name: "Lint"
+              run: |
+                black --check .
+          - python: "3.10"
+            task:
+              name: "Test"
+              run: pytest --exitfirst --failed-first --assert=plain
+          - python: "3.8"
+            task:
+              name: "Test (3.8)"
+              run: pytest --exitfirst --failed-first --assert=plain
+    steps:
+      - uses: actions/checkout@v3
+
+      # Local composite action defined in .github/actions/setup-venv.
+      - name: Setup Python environment
+        uses: ./.github/actions/setup-venv
+        with:
+          python-version: ${{ matrix.python }}
+          cache-prefix: ${{ env.CACHE_PREFIX }}
+
+      # Executes the matrix entry's shell snippet inside the virtualenv.
+      - name: ${{ matrix.task.name }}
+        run: |
+          . .venv/bin/activate
+          ${{ matrix.task.run }}
+
+      # Only the "Build" entry produces dist/; the release job downloads this
+      # artifact under the same name.
+      - name: Upload package distribution files
+        if: matrix.task.name == 'Build'
+        uses: actions/upload-artifact@v4
+        with:
+          name: package
+          path: dist
+
+      # Runs regardless of the outcome of the previous steps.
+      - name: Clean up
+        if: always()
+        run: |
+          . .venv/bin/activate
+          pip uninstall -y geoclustering
+
+  # Publishes the distribution built by the `checks` job to PyPI and creates a
+  # GitHub release. Only runs for tag refs, after all checks have passed.
+  release:
+    name: Release
+    runs-on: ubuntu-latest
+    needs: [checks]
+    if: startsWith(github.ref, 'refs/tags/')
+    steps:
+      # Same checkout major version as the `checks` job.
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install requirements
+        run: |
+          pip install --upgrade pip setuptools wheel "twine>=1.11.0"
+
+      # Derive the tag (e.g. v0.4.1) and the bare version (0.4.1) from the ref.
+      - name: Prepare environment
+        run: |
+          echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/v}" >> "$GITHUB_ENV"
+          echo "TAG=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
+
+      - name: Download package distribution files
+        uses: actions/download-artifact@v4
+        with:
+          name: package
+          path: dist
+
+      # Credentials are passed through twine's environment variables rather
+      # than interpolated into the command line, so a quote character in a
+      # secret cannot break the shell command and the values do not appear in
+      # the process arguments.
+      - name: Publish package to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          twine upload dist/*
+
+      - name: Publish GitHub release
+        uses: softprops/action-gh-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          # body_path: ${{ github.workspace }}-RELEASE_NOTES.md
+          # Tags containing "rc" are published as pre-releases.
+          prerelease: ${{ contains(env.TAG, 'rc') }}
+          files: |
+            dist/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,10 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        # It is recommended to specify the latest version of Python
+        # supported by your project here, or alternatively use
+        # pre-commit's default_language_version, see
+        # https://pre-commit.com/#top_level-default_language_version
+        language_version: python3.9
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2022, Felix Spöttel
+Copyright (c) 2022, Stichting Bellingcat

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/22
+++ b/22
@@ -0,0 +1,22 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+click = "*"
+geojson = "*"
+keplergl = "*"
+numpy = "*"
+pandas = "*"
+scikit-learn = "*"
+
+[dev-packages]
+black = "*"
+pre-commit = "*"
+pytest = "*"
+wheel = "*"
+geoclustering = {editable = true, path = "."}
+
+[requires]
+python_version = "3.9"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# geocluster
+# geoclustering

 > 📍 command-line tool for clustering geolocations.

@@ -10,34 +10,38 @@

 ### Clustering Method

-A cluster is created when a certain number of points (=> `--size`) each are within a given distance (=> `--distance`) of at least one other point in the cluster. 
+A cluster is created when a certain number of points (defined with `--size`) each are within a given distance (defined with `--distance`) of at least one other point in the cluster. 


 ## Install

-Clone the repository:
+Install with pip:

 ```sh
-git clone https://github.com/fspoettel/geocluster
-cd geocluster
+# with kepler.gl visualization support
+pip install geoclustering[full]
+
+# only text-based output
+pip install geoclustering
 ```

-Install keplergl build dependencies:
+If the `full` install fails, you might need to install kepler.gl build dependencies:

 ```sh
 # macos
 brew install proj gdal
 ```

-Install project with pip:
-```sh
-pip install .
-```
-
 ## Usage

 ```
-Usage: geocluster [OPTIONS] FILENAME
+Usage: geoclustering [OPTIONS] FILENAME
+
+  Tool to cluster geolocations. A cluster is created when a certain number of
+  points (defined with --size) each are within a given distance (defined with
+  --distance) of at least one other point in the cluster. Input is supplied as
+  a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column.
+  Other rows are reflected to the output.

 Options:
  -d, --distance FLOAT            (in km) Max. distance between two points in
@@ -50,12 +54,15 @@ Options:
                                  Clustering algorithm to be used. `optics`
                                  produces tighter clusters but is slower.
                                  Default: dbscan
+  --open                          Open the generated visualization in the
+                                  default browser automatically.
+  --debug                         Print debug output.
  --help                          Show this message and exit.
 ```

 ## Input

-Inputs are supplied as a `.csv` file. The only required fields are `lat` and `lon`, all other fields are reflected to the output.
+Inputs are supplied as a `.csv` file. At a minimum, each row needs to have a `lat` and a `lon` column. All other columns are passed through to the output.

 ```csv
 id,name,lat,lon
@@ -65,7 +72,7 @@ id,name,lat,lon

 ## Output

-If at least one cluster was found, the tool outputs a folder with `json`, `geojson`, `text` and a kepler.gl `html` files.
+If at least one cluster was found, the tool writes a folder containing the results as `json`, `geojson`, `txt` and `csv` files. A kepler.gl `html` file is generated as well when kepler.gl is installed (the `full` install).

 ### JSON

@@ -114,7 +121,7 @@ Encodes a single `FeatureCollection`, containing all points as `Feature` objects
 }
 ```

-### txt
+### Text

 Encodes cluster as blocks separated by a newline, where each line in a cluster block contains one point.

@@ -125,6 +132,39 @@ id 9, name Rosanna Foggo, lat -6.2074293, lon 106.8915948
 // ...
 ```

+### CSV
+
+Encodes each point on its own line, together with the `cluster_id` of the cluster it belongs to.
+
+```csv
+cluster_id,name,lat,lon
+9,Rosanna Foggo,-6.2074293,106.8915948
+...
+```
+
 ### kepler.gl

 ![kepler.gl instance](https://user-images.githubusercontent.com/1682504/176478177-c0446b51-4060-495c-803d-79e2bbd3e966.png)
+
+## Develop
+
+It is assumed that you are using **Python 3.9+**. We encourage you to [set up a virtualenv](https://wiki.archlinux.org/title/Python/Virtual_environment#venv) for development.
+
+```sh
+    # install dependencies & dev-dependencies
+    # PIP
+    pip install -e .[dev,full]
+    # PIPENV
+    pipenv install --dev -e .
+
+    # install a git hook that runs the code formatter before each commit.
+    pre-commit install
+```
+
+We use [Black](https://github.com/psf/black) as our code formatter. If you don't want to use the `pre-commit` hook, you can run the formatter manually or via an editor plugin.
+
+## Release
+
+1. Update [version.py](geoclustering/version.py)
+2. Run `scripts/release.sh` 
+3. Confirm GH action completed successfully
--- a/geocluster/main.py
+++ b/geocluster/main.py
@@ -1,64 +0,0 @@
-import click
-import webbrowser
-
-import geocluster.clustering as clustering
-import geocluster.encoding as encoding
-import geocluster.io as io
-
-
-@click.command()
-@click.option(
-    "--distance",
-    "-d",
-    type=click.FLOAT,
-    required=True,
-    help="(in km) Max. distance between two points in a cluster.",
-)
-@click.option(
-    "--size",
-    "-s",
-    type=click.INT,
-    required=True,
-    help="Min. number of points in a cluster.",
-)
-@click.option(
-    "--output",
-    "-o",
-    type=click.Path(exists=False),
-    default="output",
-    help="Output directory for results. Default: ./output",
-)
-@click.option(
-    "--algorithm",
-    "-a",
-    type=click.Choice(
-        ["dbscan", "optics"],
-        case_sensitive=False,
-    ),
-    default="dbscan",
-    help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
-)
-@click.argument("filename", type=click.Path(exists=True))
-def main(distance, size, output, filename, algorithm):
-    df = io.read_csv_file(filename)
-
-    clusters = clustering.cluster_locations(
-        df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
-    )
-
-    if not bool(clusters):
-        click.echo("Did not find clusters matching input parameters.")
-        return
-
-    encoded = encoding.encode_clusters(clusters)
-
-    io.write_output_file(output, "result.txt", encoded["string"])
-    io.write_output_file(output, "result.json", encoded["json"])
-    io.write_output_file(output, "result.geojson", encoded["geojson"])
-    vis = io.write_visualization(output, "result.html", encoded["geojson"])
-
-    webbrowser.open_new_tab("file://" + str(vis.absolute()))
-
-
-if __name__ == "__main__":
-    main()
--- a/geocluster/io.py
+++ b/geocluster/io.py
@@ -1,78 +0,0 @@
-from keplergl import KeplerGl
-from pathlib import Path
-from pkg_resources import resource_filename
-import json
-import json
-import pandas as pd
-import numpy as np
-
-
-def is_valid_lat(val: str) -> bool:
-    """Given a string, check if it corresponds to a valid decimal latitude value"""
-    try:
-        val = float(val)
-        return val >= -90 and val <= 90
-    except:
-        return False
-
-
-def is_valid_lon(val: str) -> bool:
-    """Given a string, check if it corresponds to a valid decimal longitude value"""
-    try:
-        val = float(val)
-        return val >= -180 and val <= 180
-    except:
-        return False
-
-
-def read_csv_file(filename):
-    """Read input csv file, dropping rows that don't have valid location data."""
-    df = pd.read_csv(filename)
-    initial_rows = len(df)
-
-    df = df.dropna(subset=["lat", "lon"])
-    df = df.replace(
-        {np.nan: None}
-    )  # replace for other fields not to break kepler parsing
-    print(f"Ignored {initial_rows - len(df)} coordinates with NaN")
-
-    valid_index = df.lat.astype(str).apply(is_valid_lat) & df.lon.astype(str).apply(
-        is_valid_lon
-    )
-    if len(df_invalid := df[~valid_index]):
-        print(f"Found {len(df_invalid)} invalid coordinate pairs, ignoring:")
-        print(df_invalid[["lat", "lon"]].to_string())
-    return df[valid_index]
-
-
-def ensure_file_path(dirname, filename):
-    """Ensure a parent directory exists for a file."""
-    path = Path(dirname)
-    path.mkdir(parents=True, exist_ok=True)
-    return path / filename
-
-
-def write_output_file(dirname, filename, data):
-    """Write a file, ensuring parent directories."""
-    filepath = ensure_file_path(dirname, filename)
-
-    with open(filepath, "w") as f:
-        f.write(data)
-
-    return filepath
-
-
-def write_visualization(dirname, filename, data):
-    """Write a visualization, ensuring parent directories."""
-    map = KeplerGl()
-    map.add_data(data=data, name="clusters")
-
-    # config configures a default color scheme for our clusters layer.
-    config_file = resource_filename("geocluster", "kepler_config.json")
-    with open(config_file) as f:
-        map.config = json.loads(f.read())
-
-    filepath = ensure_file_path(dirname, filename)
-    map.save_to_html(file_name=str(filepath), center_map=True)
-
-    return filepath
--- a/geoclustering/init.py
+++ b/geoclustering/init.py
--- a/geoclustering/main.py
+++ b/geoclustering/main.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+import click
+import webbrowser
+
+import geoclustering.clustering as clustering
+import geoclustering.encoding as encoding
+import geoclustering.io as io
+
+
+# CLI entry point. Reads the input csv, clusters the valid coordinates, writes
+# every encoded result into the output directory and optionally opens the
+# generated visualization in the browser.
+@click.command(
+    help="Tool to cluster geolocations. A cluster is created when a certain number of points (defined with --size) each are within a given distance (defined with --distance) of at least one other point in the cluster. Input is supplied as a csv file. At a minimum, each row needs to have a 'lat' and a 'lon' column. Other rows are reflected to the output."
+)
+@click.option(
+    "--distance",
+    "-d",
+    type=click.FLOAT,
+    required=True,
+    help="(in km) Max. distance between two points in a cluster.",
+)
+@click.option(
+    "--size",
+    "-s",
+    type=click.INT,
+    required=True,
+    help="Min. number of points in a cluster.",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(exists=False),
+    default="output",
+    help="Output directory for results. Default: ./output",
+)
+@click.option(
+    "--algorithm",
+    "-a",
+    type=click.Choice(
+        ["dbscan", "optics"],
+        case_sensitive=False,
+    ),
+    default="dbscan",
+    help="Clustering algorithm to be used. `optics` produces tighter clusters but is slower. Default: dbscan",
+)
+# The flag is exposed as `--open` but bound to the parameter `_open`.
+@click.option(
+    "--open",
+    "_open",
+    is_flag=True,
+    help="Open the generated visualization in the default browser automatically.",
+)
+@click.option("--debug", is_flag=True, help="Print debug output.")
+@click.argument("filename", type=click.Path(exists=True))
+def main(distance, size, output, filename, algorithm, _open, debug):
+    # Prints only when --debug was passed.
+    def print_debug(s):
+        if debug:
+            click.secho(s, fg="bright_black")
+
+    df = io.read_csv_file(filename)
+    print_debug(f"Read {len(df)} valid coordinates from {Path(filename).absolute()}")
+
+    clusters = clustering.cluster_locations(
+        df=df, algorithm=algorithm, radius_km=distance, min_cluster_size=size
+    )
+
+    # Nothing to write when no cluster was found; warn and stop early.
+    if not bool(clusters):
+        click.secho("Did not find clusters matching input parameters.", fg="yellow")
+        return
+
+    print_debug(f"Found {len(clusters)} valid clusters using {algorithm}")
+
+    # One output file per encoding produced by encode_clusters.
+    encoded = encoding.encode_clusters(clusters)
+    io.write_output_file(output, "result.txt", encoded["string"])
+    io.write_output_file(output, "result.json", encoded["json"])
+    io.write_output_file(output, "result.geojson", encoded["geojson"])
+    io.write_output_file(output, "result.csv", encoded["csv"])
+
+    # write_visualization returns None when kepler is not installed.
+    vis = io.write_visualization(output, "result.html", encoded["geojson"])
+    if vis is None:
+        print_debug("Skipped generating visualization: kepler is not installed.")
+
+    click.echo(f"Output files saved to {Path(output).absolute()}")
+
+    if _open:
+        if vis:
+            webbrowser.open_new_tab("file://" + str(vis.absolute()))
+            print_debug("Opened visualization in default browser.")
+        else:
+            # --open was requested but there is no visualization to show.
+            click.secho(
+                "Can't open kepler.gl: package not installed. Please re-install geoclustering with `pip install geoclustering[full]`.",
+                fg="yellow",
+            )
+
+    click.secho("Clustering completed.", fg="green")
+
+if __name__ == "__main__":
+    main()
--- a/geoclustering/clustering.py
+++ b/geoclustering/clustering.py
@@ -14,8 +14,6 @@ def to_cluster_dict(df, clustering):
    """
    clusters_by_id = {}

-    print(clustering.labels_)
-
    for idx, cluster_id in enumerate(clustering.labels_):
        # ignore "noise" locations that don't belong to any cluster.
        if cluster_id > -1:
--- a/geoclustering/encoding.py
+++ b/geoclustering/encoding.py
@@ -1,6 +1,8 @@
 import json
 import numpy as np
 import geojson
+import csv
+import io  # not io.py


 class NpEncoder(json.JSONEncoder):
@@ -47,7 +49,7 @@ class JSONEncoder:

        for record in cluster:
            cluster_data["points"].append(record)
-            self.state.append(cluster_data)
+        self.state.append(cluster_data)

    def get(self):
        return json.dumps(self.state, cls=NpEncoder)
@@ -74,13 +76,37 @@ class GeoJSONEncoder:
        return json.dumps(geojson.FeatureCollection(self.state), cls=NpEncoder)


+class CSVEncoder:
+    """Encodes clustering result as a CSV
+
+    Rows are accumulated in an in-memory text buffer. The header is derived
+    from the keys of the first record seen, prefixed with a `cluster_id`
+    column; every record is then written as one row tagged with the id of
+    the cluster it belongs to.
+    """
+
+    def __init__(self):
+        self.state = io.StringIO()
+        # Created lazily on the first non-empty cluster, because the column
+        # names are only known once a record is available.
+        self.writer = None
+
+    def visitor(self, cluster_id, cluster):
+        """Append every record of `cluster` as a row tagged with `cluster_id`."""
+        # An empty cluster contributes no rows and cannot provide a header.
+        if not cluster:
+            return
+
+        if self.writer is None:
+            self.writer = csv.DictWriter(
+                self.state,
+                fieldnames=["cluster_id"] + list(cluster[0].keys()),
+                quoting=csv.QUOTE_NONNUMERIC,
+                lineterminator="\n",
+            )
+            self.writer.writeheader()
+
+        for record in cluster:
+            self.writer.writerow({**record, "cluster_id": cluster_id})
+
+    def get(self):
+        """Return the CSV text written so far."""
+        return self.state.getvalue()
+
+
 def encode_clusters(clusters):
    json_encoder = JSONEncoder()
    geojson_encoder = GeoJSONEncoder()
    string_encoder = StringEncoder()
+    csv_encoder = CSVEncoder()

-    encoders = [json_encoder, geojson_encoder, string_encoder]
-
+    encoders = [json_encoder, geojson_encoder, string_encoder, csv_encoder]
    for cluster_id, cluster in clusters.items():
        for encoder in encoders:
            encoder.visitor(cluster_id, cluster)
@@ -89,4 +115,5 @@ def encode_clusters(clusters):
        "json": json_encoder.get(),
        "geojson": geojson_encoder.get(),
        "string": string_encoder.get(),
+        "csv": csv_encoder.get(),
    }
--- a/geoclustering/io.py
+++ b/geoclustering/io.py
@@ -0,0 +1,120 @@
+from pathlib import Path
+from pkg_resources import resource_filename
+import json
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+# kepler is optional, check if installed.
+# Any ordinary failure while importing it is treated as "not available";
+# `except Exception` (rather than a bare `except`) keeps KeyboardInterrupt
+# and SystemExit propagating.
+try:
+    from keplergl import KeplerGl
+except Exception:
+    has_kepler = False
+else:
+    has_kepler = True
+
+
+class HiddenPrints:
+    """Disables stdout prints for a block of code.
+
+    Used as a context manager: on entry `sys.stdout` is pointed at the null
+    device, on exit the previous stream is put back.
+    """
+
+    def __enter__(self):
+        self._original_stdout = sys.stdout
+        # Keep our own reference so __exit__ closes exactly this handle.
+        self._devnull = open(os.devnull, "w")
+        sys.stdout = self._devnull
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Restore first, then close the handle we opened (not whatever
+        # sys.stdout happens to be at this point).
+        sys.stdout = self._original_stdout
+        self._devnull.close()
+
+
+def is_valid_lat(val: str) -> bool:
+    """Given a string, check if it corresponds to a valid decimal latitude value
+
+    Returns True when `val` converts to a float within [-90, 90]; returns
+    False when it is out of range or cannot be converted (e.g. None or
+    non-numeric text). NaN compares false against both bounds and is
+    therefore rejected.
+    """
+    try:
+        lat = float(val)
+    except (TypeError, ValueError, OverflowError):
+        return False
+    return -90 <= lat <= 90
+
+
+def is_valid_lon(val: str) -> bool:
+    """Given a string, check if it corresponds to a valid decimal longitude value
+
+    Returns True when `val` converts to a float within [-180, 180]; returns
+    False when it is out of range or cannot be converted (e.g. None or
+    non-numeric text). NaN compares false against both bounds and is
+    therefore rejected.
+    """
+    try:
+        lon = float(val)
+    except (TypeError, ValueError, OverflowError):
+        return False
+    return -180 <= lon <= 180
+
+
+def is_not_none(val: object) -> bool:
+    """Return True for every value except None."""
+    # `object` is used as the annotation: the lowercase builtin `any` is a
+    # function, not a type.
+    return val is not None
+
+
+def read_csv_file(filename):
+    """Read input csv file, dropping rows that don't have valid location data.
+
+    A row is kept only when both its `lat` and `lon` values pass
+    `is_valid_lat` / `is_valid_lon`. Dropped rows are reported on stdout,
+    split into rows where both coordinates are missing and rows where at
+    least one coordinate is present but the pair is invalid.
+
+    Returns the DataFrame restricted to the valid rows.
+    """
+    # replace NaN for all fields not to break kepler parsing.
+    df = pd.read_csv(filename).replace({np.nan: None})
+
+    # construct an index of values with valid lat & lon.
+    valid_index = df.lat.apply(is_valid_lat) & df.lon.apply(is_valid_lon)
+    df_invalid = df[~valid_index]
+
+    count_invalid = len(df_invalid)
+    if count_invalid:
+        # Invalid rows that still carry at least one coordinate value; the
+        # rest of the invalid rows have neither lat nor lon.
+        df_not_empty = df_invalid[
+            (df_invalid.lat.apply(is_not_none) | df_invalid.lon.apply(is_not_none))
+        ]
+
+        count_not_empty = len(df_not_empty)
+        count_empty = count_invalid - count_not_empty
+
+        if count_empty:
+            print(f"Removed {count_empty} empty coordinate pairs.")
+
+        if count_not_empty:
+            # Show the offending values so the user can fix the input.
+            print(f"Removed {count_not_empty} invalid coordinate pairs:")
+            print(df_not_empty[["lat", "lon"]].to_string())
+
+    return df[valid_index]
+
+
+def ensure_file_path(dirname, filename):
+    """Create `dirname` (and any missing parents) and return the path of
+    `filename` inside it. The file itself is not created."""
+    target_dir = Path(dirname)
+    target_dir.mkdir(parents=True, exist_ok=True)
+    return target_dir.joinpath(filename)
+
+
+def write_output_file(dirname, filename, data):
+    """Write a file, ensuring parent directories.
+
+    `data` is the text to write. The file is always encoded as UTF-8 so that
+    non-ASCII field values are written the same way on every platform,
+    independent of the locale's default encoding.
+
+    Returns the path of the written file.
+    """
+    filepath = ensure_file_path(dirname, filename)
+
+    with open(filepath, "w", encoding="utf-8") as f:
+        f.write(data)
+
+    return filepath
+
+
+def write_visualization(dirname, filename, data):
+    """Render `data` as a kepler.gl html map inside `dirname`.
+
+    Returns the path of the written html file, or None when kepler is not
+    installed (in which case nothing is written).
+    """
+    if not has_kepler:
+        return None
+
+    # Kepler prints to stdout on construction; keep that out of our output.
+    with HiddenPrints():
+        kepler_map = KeplerGl()
+
+    kepler_map.add_data(data=data, name="clusters")
+
+    # The bundled config sets a default color scheme for our clusters layer.
+    config_path = resource_filename("geoclustering", "kepler_config.json")
+    with open(config_path) as config_handle:
+        kepler_map.config = json.load(config_handle)
+
+    html_path = ensure_file_path(dirname, filename)
+
+    # Saving prints to stdout as well; silence it the same way.
+    with HiddenPrints():
+        kepler_map.save_to_html(file_name=str(html_path), center_map=True)
+
+    return html_path
--- a/geoclustering/kepler_config.json
+++ b/geoclustering/kepler_config.json
@@ -9,7 +9,7 @@
          "config": {
            "dataId": "clusters",
            "label": "clusters",
-            "color": [179, 173, 158],
+            "color": [248, 149, 112],
            "highlightColor": [252, 242, 26, 255],
            "columns": { "geojson": "_geojson" },
            "isVisible": true,
@@ -19,16 +19,30 @@
              "thickness": 0.5,
              "strokeColor": null,
              "colorRange": {
-                "name": "Global Warming",
-                "type": "sequential",
+                "name": "Uber Viz Qualitative 4",
+                "type": "qualitative",
                "category": "Uber",
                "colors": [
-                  "#5A1846",
-                  "#900C3F",
-                  "#C70039",
-                  "#E3611C",
-                  "#F1920E",
-                  "#FFC300"
+                  "#12939A",
+                  "#DDB27C",
+                  "#88572C",
+                  "#FF991F",
+                  "#F15C17",
+                  "#223F9A",
+                  "#DA70BF",
+                  "#125C77",
+                  "#4DC19C",
+                  "#776E57",
+                  "#17B8BE",
+                  "#F6D18A",
+                  "#B7885E",
+                  "#FFCB99",
+                  "#F89570",
+                  "#829AE3",
+                  "#E79FD5",
+                  "#1E96BE",
+                  "#89DAC1",
+                  "#B3AD9E"
                ]
              },
              "strokeColorRange": {
--- a/geoclustering/version.py
+++ b/geoclustering/version.py
@@ -0,0 +1,11 @@
+_MAJOR = "0"
+_MINOR = "4"
+# On main and in a nightly release the patch should be one ahead of the last
+# released build.
+_PATCH = "1"
+# This is mainly for nightly builds which have the suffix ".dev$DATE". See
+# https://semver.org/#is-v123-a-semantic-version for the semantics.
+_SUFFIX = ""
+
+VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
+VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
--- a/pytest.ini
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests/
+python_files = *.py
--- a/scripts/release.sh
+++ b/scripts/release.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+#
+# Commits any pending changes, then tags and pushes a release for the version
+# declared in geoclustering/version.py. Pushing the tag is what triggers the
+# release job of the GitHub workflow.
+
+set -e
+
+TAG=$(python -c 'from geoclustering.version import VERSION; print("v" + VERSION)')
+
+# Only an explicit yes continues; anything else (including just pressing
+# Enter) cancels, hence the [y/N] hint. -r keeps backslashes literal.
+read -r -p "Creating new release for $TAG. Do you want to continue? [y/N] " prompt
+
+case "$prompt" in
+    y|Y|yes|Yes)
+        git add -A
+        # "Nothing to commit" must not abort the release under `set -e`.
+        git commit -m "Bump version to $TAG for release" || true
+        git push
+        echo "Creating new git tag $TAG"
+        git tag "$TAG" -m "$TAG"
+        git push --tags
+        ;;
+    *)
+        echo "Cancelled"
+        exit 1
+        ;;
+esac
--- a/setup.py
+++ b/setup.py
@@ -1,21 +1,41 @@
 from setuptools import setup

+# version.py defines the VERSION and VERSION_SHORT variables.
+# We use exec here so we don't import the geoclustering package (and with it
+# its runtime dependencies) whilst setting up.
+VERSION = {}  # type: ignore
+with open("geoclustering/version.py", "r") as version_file:
+    exec(version_file.read(), VERSION)
+
+# The README contains non-ASCII characters, so read it explicitly as UTF-8
+# instead of relying on the platform default, and close the handle again.
+with open("README.md", "r", encoding="utf-8") as readme_file:
+    LONG_DESCRIPTION = readme_file.read()
+
 setup(
-    name="geocluster",
-    version="0.1",
-    description="",
+    name="geoclustering",
+    version=VERSION["VERSION"],
+    description="📍 command-line tool for clustering geolocations.",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    classifiers=[
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+    ],
    author="Bellingcat",
-    packages=["geocluster"],
-    entry_points={"console_scripts": ["geocluster = geocluster.__main__:main"]},
+    author_email="tech@bellingcat.com",
+    license="MIT",
+    packages=["geoclustering"],
+    package_data={"geoclustering": ["kepler_config.json"]},
+    keywords=["cluster", "gis", "pattern-analysis"],
+    entry_points={"console_scripts": ["geoclustering = geoclustering.__main__:main"]},
    install_requires=[
        "click",
        "geojson",
-        "keplergl",
        "numpy",
        "pandas",
        "scikit-learn",
    ],
-    extras_require={"dev": ["black", "wheel"]},
+    # `full` adds the optional kepler.gl visualization dependency.
+    extras_require={
+        "dev": ["black", "wheel", "pre-commit", "pytest"],
+        "full": ["keplergl"],
+    },
    include_package_data=True,
    zip_safe=False,
 )
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/clustering.py
+++ b/tests/clustering.py
@@ -0,0 +1,41 @@
+from geoclustering.clustering import cluster_locations
+from tests.helpers import read_fixture_csv
+
+
+df = read_fixture_csv("clustering.csv")
+
+
+def has_member(list, name):
+    return any(x for x in list if x["name"] == name)
+
+
+def test_clustering_all():
+    # there should be one cluster with all members but Erin.
+    res = cluster_locations(
+        df=df, algorithm="dbscan", radius_km=1.97, min_cluster_size=4
+    )
+    assert len(res.values()) == 1
+    assert len(res[0]) == 4
+
+
+def test_clustering_split():
+    """A small radius with a minimum size of two splits the fixture in two."""
+    res = cluster_locations(
+        df=df, algorithm="dbscan", radius_km=0.5, min_cluster_size=2
+    )
+    # there should be two clusters: Alice & Bob and Carol & Dan
+    assert len(res.values()) == 2
+    cluster_one = res[0]
+    cluster_two = res[1]
+    assert len(cluster_one) == 2
+    assert has_member(cluster_one, "Alice")
+    assert has_member(cluster_one, "Bob")
+    # check the size of the second cluster too, so an extra member is caught.
+    assert len(cluster_two) == 2
+    assert has_member(cluster_two, "Carol")
+    assert has_member(cluster_two, "Dan")
+
+
+def test_clustering_none():
+    # there should be no clusters now.
+    res = cluster_locations(
+        df=df, algorithm="dbscan", radius_km=0.5, min_cluster_size=3
+    )
+    assert len(res.values()) == 0
--- a/tests/encoding.py
+++ b/tests/encoding.py
@@ -0,0 +1,30 @@
+from geoclustering.encoding import encode_clusters
+from tests.helpers import read_fixture_csv, read_fixture_content
+
+
+df = read_fixture_csv("clustering.csv")
+
+
+def test_encoders():
+    clusters = {
+        0: [
+            {"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362},
+            {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097},
+        ],
+        1: [
+            {"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246},
+            {
+                "id": 4,
+                "name": "Dan",
+                "lat": 52.52443559865125,
+                "lon": 13.41261723049818,
+            },
+        ],
+    }
+
+    res = encode_clusters(clusters)
+
+    assert res["string"] == read_fixture_content("snapshots/result.txt")
+    assert res["json"] == read_fixture_content("snapshots/result.json")
+    assert res["geojson"] == read_fixture_content("snapshots/result.geojson")
+    assert res["csv"] == read_fixture_content("snapshots/result.csv")
--- a/tests/fixtures/clustering.csv
+++ b/tests/fixtures/clustering.csv
@@ -0,0 +1,6 @@
+id,name,lat,lon
+1,Alice,52.523955,13.442362
+2,Bob,52.526659,13.448097
+3,Carol,52.525626,13.419246
+4,Dan,52.52443559865125,13.41261723049818
+5,Erin,52.524838991760774,13.383188597040382
--- a/tests/fixtures/io.csv
+++ b/tests/fixtures/io.csv
@@ -0,0 +1,9 @@
+id,name,lat,lon
+1,Alice,,
+2,,52.523955,13.442362
+,,-90.12,132.23
+4,,78.234,-180.1212
+5,Bob,52.524838991760774,13.383188597040382
+6,Peter,91.234,
+7,Horst,,23.23
+7,Erin,foo,bar
--- a/tests/fixtures/mock1000.csv
+++ b/tests/fixtures/mock1000.csv
--- a/tests/fixtures/mock50.csv
+++ b/tests/fixtures/mock50.csv
@@ -1,51 +1,51 @@
-id,name,lat,lon
-1,Bonnibelle Mathwen,40.1324085,64.4911086
-2,Fayette Elt,49.6235379,6.2379992
-3,Jandy Cooch,-7.5874497,110.7420464
-4,Robb Gerbel,22.2455315,-80.3936994
-5,Silvie Clipson,40.3418956,21.5118754
-6,Kristina Izakoff,30.741991,121.341969
-7,Ricky Sweeting,11.2666664,122.5333328
-8,Quintin Hazart,35.119385,109.167435
-9,Sholom Kilmister,55.7393377,37.6642542
-10,Misty Dooher,49.9776657,20.9421091
-11,Knox Phython,-8.4985,123.5226
-12,Shay Davidy,14.4142191,120.9495257
-13,Dre Benoey,-31.4561755,-64.2111608
-14,Prudi Tomek,40.692169,117.163821
-15,Evey Ealam,31.123586,114.893666
-16,Norry Urch,45.8022541,17.497172
-17,Valerye Dumberell,50.4438122,48.1450932
-18,Freddy Furtado,58.3767785,11.6764538
-19,Catarina Samett,50.4034992,26.141892
-20,Lidia Muckian,-38.7359018,-72.5903739
-21,Stacey Dockrey,29.741986,106.273576
-22,Norri Bonhill,60.6184239,16.7769535
-23,Florence Pretsel,55.96667,25.15
-24,Marten Matantsev,50.9603536,14.3596743
-25,Claiborn Everall,43.884893,-0.5046003
-26,Randolf Hailey,49.4679131,18.2282007
-27,Meggi Kirkebye,57.6888453,11.9943311
-28,Denna Le Grove,16.7124054,98.5746649
-29,Randy Verheijden,40.4722617,-7.9751886
-30,Caterina Blancowe,35.422892,103.352654
-31,Joanne Adamovitch,55.9251242,39.4489055
-32,Orazio Coppins,,111.6556388
-33,Anastassia Bennedsen,45.212088,130.478187
-34,Linoel Ruggier,22.066171,107.781956
-35,Paulina Moralis,-11.806679,-77.1657716
-36,Ambur Outhwaite,59.4033695,17.9443213
-37,Laetitia Aspland,37.6086169,138.9089988
-38,Dew Moxstead,6.1317011,-75.6382657
-39,Berna Klaiser,40.1394691,-8.3092933
-40,Krystle Ingold,7.1518505,0.4738293
-41,Cassaundra Cuffin,56.6342788,36.885813
-42,Malanie Harpin,46.9,109.75
-43,Laurence Stothart,39.912765,116.18362
-44,Luz O'Siaghail,40.4476834,25.5917918
-45,Brittni Garrod,59.0836123,16.18741
-46,Karlie Semrad,-8.793392,121.9330894
-47,Leigh Allderidge,45.768045,15.947739
-48,Ashlin Gogerty,50.3250139,34.9100068
-49,Mozelle De Launde,53.31611,40.70806
-50,Ema le Keux,41.6315023,19.9310781
+id,name,lat,lon
+1,Bonnibelle Mathwen,40.1324085,64.4911086
+2,Fayette Elt,49.6235379,6.2379992
+3,Jandy Cooch,-7.5874497,110.7420464
+4,Robb Gerbel,22.2455315,-80.3936994
+5,Silvie Clipson,40.3418956,21.5118754
+6,Kristina Izakoff,30.741991,121.341969
+7,Ricky Sweeting,11.2666664,122.5333328
+8,Quintin Hazart,35.119385,109.167435
+9,Sholom Kilmister,55.7393377,37.6642542
+10,Misty Dooher,49.9776657,20.9421091
+11,Knox Phython,-8.4985,123.5226
+12,Shay Davidy,14.4142191,120.9495257
+13,Dre Benoey,-31.4561755,-64.2111608
+14,Prudi Tomek,40.692169,117.163821
+15,Evey Ealam,31.123586,114.893666
+16,Norry Urch,45.8022541,17.497172
+17,Valerye Dumberell,50.4438122,48.1450932
+18,Freddy Furtado,58.3767785,11.6764538
+19,Catarina Samett,50.4034992,26.141892
+20,Lidia Muckian,-38.7359018,-72.5903739
+21,Stacey Dockrey,29.741986,106.273576
+22,Norri Bonhill,60.6184239,16.7769535
+23,Florence Pretsel,55.96667,25.15
+24,Marten Matantsev,50.9603536,14.3596743
+25,Claiborn Everall,43.884893,-0.5046003
+26,Randolf Hailey,49.4679131,18.2282007
+27,Meggi Kirkebye,57.6888453,11.9943311
+28,Denna Le Grove,16.7124054,98.5746649
+29,Randy Verheijden,40.4722617,-7.9751886
+30,Caterina Blancowe,35.422892,103.352654
+31,Joanne Adamovitch,55.9251242,39.4489055
+32,Orazio Coppins,,111.6556388
+33,Anastassia Bennedsen,45.212088,130.478187
+34,Linoel Ruggier,22.066171,107.781956
+35,Paulina Moralis,-11.806679,-77.1657716
+36,Ambur Outhwaite,59.4033695,17.9443213
+37,Laetitia Aspland,37.6086169,138.9089988
+38,Dew Moxstead,6.1317011,-75.6382657
+39,Berna Klaiser,40.1394691,-8.3092933
+40,Krystle Ingold,7.1518505,0.4738293
+41,Cassaundra Cuffin,56.6342788,36.885813
+42,Malanie Harpin,46.9,109.75
+43,Laurence Stothart,39.912765,116.18362
+44,Luz O'Siaghail,40.4476834,25.5917918
+45,Brittni Garrod,59.0836123,16.18741
+46,Karlie Semrad,-8.793392,121.9330894
+47,Leigh Allderidge,45.768045,15.947739
+48,Ashlin Gogerty,50.3250139,34.9100068
+49,Mozelle De Launde,53.31611,40.70806
+50,Ema le Keux,41.6315023,19.9310781
--- a/tests/fixtures/snapshots/result.csv
+++ b/tests/fixtures/snapshots/result.csv
@@ -0,0 +1,5 @@
+"cluster_id","id","name","lat","lon"
+0,1,"Alice",52.523955,13.442362
+0,2,"Bob",52.526659,13.448097
+1,3,"Carol",52.525626,13.419246
+1,4,"Dan",52.52443559865125,13.41261723049818
--- a/tests/fixtures/snapshots/result.geojson
+++ b/tests/fixtures/snapshots/result.geojson
@@ -0,0 +1 @@
+{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.442362, 52.523955]}, "properties": {"id": 1, "name": "Alice", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.448097, 52.526659]}, "properties": {"id": 2, "name": "Bob", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.419246, 52.525626]}, "properties": {"id": 3, "name": "Carol", "cluster_id": 1}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.412617, 52.524436]}, "properties": {"id": 4, "name": "Dan", "cluster_id": 1}}]}
--- a/tests/fixtures/snapshots/result.json
+++ b/tests/fixtures/snapshots/result.json
@@ -0,0 +1 @@
+[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]
--- a/tests/fixtures/snapshots/result.txt
+++ b/tests/fixtures/snapshots/result.txt
@@ -0,0 +1,7 @@
+Cluster 0
+id 1, name Alice, lat 52.523955, lon 13.442362
+id 2, name Bob, lat 52.526659, lon 13.448097
+
+Cluster 1
+id 3, name Carol, lat 52.525626, lon 13.419246
+id 4, name Dan, lat 52.52443559865125, lon 13.41261723049818
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -0,0 +1,16 @@
+import os
+from geoclustering.io import read_csv_file
+
+
+def get_fixture_path(filename):
+    here, _ = os.path.split(os.path.realpath(__file__))  # directory holding this module
+    return os.path.join(here, "fixtures", filename)
+
+
+def read_fixture_csv(filename):
+    return read_csv_file(get_fixture_path(filename))  # loaded via geoclustering.io, not raw pandas
+
+
+def read_fixture_content(filename):
+    with open(get_fixture_path(filename), encoding="utf-8") as f:  # not locale-dependent
+        return f.read()
--- a/tests/io.py
+++ b/tests/io.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+import shutil
+from geoclustering.io import write_output_file
+from tests.helpers import read_fixture_csv
+
+
+def test_csv_filters():
+    df = read_fixture_csv("io.csv")
+    # Only the fixture rows with ids 2 and 5 are expected to survive loading.
+    assert len(df) == 2
+    assert df.iloc[0]["name"] is None  # identity check; `== None` can be overloaded
+    assert df.iloc[1]["name"] == "Bob"
+
+
+def test_write_output_file():
+    """Writing into a directory that does not exist yet should still produce the file."""
+    p = "./this/dir/does/not/exist"
+    name = "test.txt"
+    try:
+        write_output_file(p, name, "test")
+        with open(Path(p) / name) as handle:
+            assert handle.read() == "test"
+    finally:
+        # Remove the scratch tree even when the write or the assertion fails.
+        shutil.rmtree(Path("./this"), ignore_errors=True)
Author	SHA1	Message	Date
Miguel Sozinho Ramalho	e9b7680263	migrate gh artifact actions to v4 (#20 ) * migrate gh artifact actions to v4 from migration guide no breaking changes apply here. * updates pipfile.lock dependency versions * updates CI due to pytest issue see https://github.com/scipy/scipy/issues/22236 * bump to python 3.12 * revert to py3.10	2025-01-09 15:47:27 +00:00
msramalho	de4d4689b9	Bump version to v0.4.1 for release	2022-09-27 14:49:17 +01:00
msramalho	484d3cb02c	adds: tests for csv	2022-09-27 14:49:04 +01:00
msramalho	65366816fa	updates readme with release info	2022-09-27 14:43:05 +01:00
msramalho	de91354867	Bump version to v0.4.0 for release	2022-09-27 14:41:48 +01:00
Kashyap Maheshwari	e9a7519168	Add new output format: csv with cluster info (#18 ) Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>	2022-09-27 14:39:50 +01:00
msramalho	dc7e12642e	adds dev pipenv instructions	2022-09-27 13:59:49 +01:00
msramalho	93c51d7a80	closes #17	2022-09-27 13:58:17 +01:00
msramalho	f77d1d9d62	closes #17	2022-09-27 13:58:00 +01:00
Felix Spöttel	99e844c6ce	fix: compatibility with python < 3.8 (#16 ) * ci: run tests in python 3.7 as well	2022-07-07 10:21:21 +02:00
msramalho	ff094a1d3e	fix: unused import, protected keyword use	2022-07-05 16:36:54 +02:00
Felix Spöttel	926aaf73d6	Bump version to v0.3.0 for release	2022-07-04 16:43:48 +02:00
Felix Spöttel	6a5cb3c3c3	feat: optional kepler.gl integration (#12 )	2022-07-04 16:22:27 +02:00
Felix Spöttel	d252c6b8f3	test: add test suite (#7 ) * add pre-commit hook * improve logging of inconsistent data	2022-07-04 13:54:07 +02:00
Felix Spöttel	1c5d0f649e	docs: update cli documentation	2022-07-01 19:18:06 +02:00
Felix Spöttel	6ed01417c3	docs: update install section	2022-07-01 18:57:40 +02:00
Felix Spöttel	3cc3c30e03	Bump version to v0.2.1 for release	2022-07-01 18:52:00 +02:00
Felix Spöttel	c9d36c6bf3	feat: print success output	2022-07-01 18:51:25 +02:00
Felix Spöttel	62da0806c7	fix: debug prints	2022-07-01 18:48:17 +02:00
Felix Spöttel	8657bd73ec	Bump version to v0.2.0 for release	2022-07-01 18:15:07 +02:00
Felix Spöttel	e633665813	chore: update license	2022-07-01 18:12:00 +02:00
Felix Spöttel	cff5256d06	feat: add `--debug` flag, improve logging & help closes #9	2022-07-01 17:53:09 +02:00
Felix Spöttel	4dfa08bbbc	feat: add `--open` flag (#11 ) closes #5	2022-07-01 17:08:53 +02:00
Felix Spöttel	eaa4022b70	ci: use pipfile.lock as cache key	2022-07-01 17:05:43 +02:00
Felix Spöttel	1cb5541baa	chore: remove clustering print	2022-07-01 17:04:56 +02:00
Felix Spöttel	b40074317c	feat: extend kepler.gl color range closes #10	2022-07-01 17:04:33 +02:00
Miguel Sozinho Ramalho	f1053953ba	feat: auto-deploy to pypi (#8 )	2022-07-01 15:23:50 +01:00
				`@@ -0,0 +1 @@`
				{"type": "FeatureCollection", "features": [{"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.442362, 52.523955]}, "properties": {"id": 1, "name": "Alice", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.448097, 52.526659]}, "properties": {"id": 2, "name": "Bob", "cluster_id": 0}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.419246, 52.525626]}, "properties": {"id": 3, "name": "Carol", "cluster_id": 1}}, {"type": "Feature", "geometry": {"type": "Point", "coordinates": [13.412617, 52.524436]}, "properties": {"id": 4, "name": "Dan", "cluster_id": 1}}]}
				`@@ -0,0 +1 @@`
				`[{"cluster_id": 0, "points": [{"id": 1, "name": "Alice", "lat": 52.523955, "lon": 13.442362}, {"id": 2, "name": "Bob", "lat": 52.526659, "lon": 13.448097}]}, {"cluster_id": 1, "points": [{"id": 3, "name": "Carol", "lat": 52.525626, "lon": 13.419246}, {"id": 4, "name": "Dan", "lat": 52.52443559865125, "lon": 13.41261723049818}]}]`