Use numeric indices instead of label names for node keys, and remove redundant fields, to reduce the size of dataset_entities.json and improve performance

2026-06-08 03:28:33 +03:00 · 2022-12-16 01:30:54 -06:00
parent 9f39ceee7b
commit 44ea5a2ceb
2 changed files with 7 additions and 4 deletions
--- a/public/dataset_entities.json
+++ b/public/dataset_entities.json
--- a/scripts/generate_visualisation.py
+++ b/scripts/generate_visualisation.py
@@ -60,7 +60,7 @@ if __name__ == "__main__":
    ).reset_index(drop=True)
    _edges_df = nx.to_pandas_edgelist(G=G)

-    nodes_df = _nodes_df[["x", "y", "label", "size", "frequency", "Modularity Class"]]
+    nodes_df = _nodes_df[["x", "y", "label", "size", "Modularity Class"]]
    edges_df = _edges_df[["source", "target"]]

    clusters_to_combine = {
@@ -73,11 +73,14 @@ if __name__ == "__main__":
        lambda c: "100" if c in clusters_to_combine else str(c)
    )
    nodes_df.drop("Modularity Class", axis="columns", inplace=True)
-    nodes_df["key"] = nodes_df["label"]
+
+    label_to_index = {t[0] : i for i, t in enumerate(Counter(list(edges_df['source']) + list(edges_df['target'])).most_common())}
+
+    nodes_df["key"] = nodes_df['label'].map(label_to_index)
    nodes_df["size"] /= NODE_SCALING
    nodes = nodes_df.to_dict(orient="records")

-    edges = [[e["source"], e["target"]] for e in edges_df.to_dict(orient="records")]
+    edges = [[label_to_index[e["source"]], label_to_index[e["target"]]] for e in edges_df.to_dict(orient="records")]
    data = {
        "nodes": nodes,
        "edges": edges,