Use numeric indices instead of label names for node keys and remove redundant fields, reducing the size of dataset_entities.json and improving performance

This commit is contained in:
Tristan Lee
2022-12-16 01:30:54 -06:00
parent 9f39ceee7b
commit 44ea5a2ceb
2 changed files with 7 additions and 4 deletions

File diff suppressed because one or more lines are too long

View File

@@ -60,7 +60,7 @@ if __name__ == "__main__":
).reset_index(drop=True)
_edges_df = nx.to_pandas_edgelist(G=G)
nodes_df = _nodes_df[["x", "y", "label", "size", "frequency", "Modularity Class"]]
nodes_df = _nodes_df[["x", "y", "label", "size", "Modularity Class"]]
edges_df = _edges_df[["source", "target"]]
clusters_to_combine = {
@@ -73,11 +73,14 @@ if __name__ == "__main__":
lambda c: "100" if c in clusters_to_combine else str(c)
)
nodes_df.drop("Modularity Class", axis="columns", inplace=True)
nodes_df["key"] = nodes_df["label"]
label_to_index = {t[0] : i for i, t in enumerate(Counter(list(edges_df['source']) + list(edges_df['target'])).most_common())}
nodes_df["key"] = nodes_df['label'].map(label_to_index)
nodes_df["size"] /= NODE_SCALING
nodes = nodes_df.to_dict(orient="records")
edges = [[e["source"], e["target"]] for e in edges_df.to_dict(orient="records")]
edges = [[label_to_index[e["source"]], label_to_index[e["target"]]] for e in edges_df.to_dict(orient="records")]
data = {
"nodes": nodes,
"edges": edges,