Node Regression with Subgraph and Graph Sample projections
This Jupyter notebook is hosted here in the Neo4j Graph Data Science Client GitHub repository.
For a video presentation of an earlier version of this notebook, see the talk Fundamentals of Neo4j Graph Data Science Series 2.x – Pipelines and More, which was given at the NODES 2022 conference.
The notebook exemplifies the use of a Node Regression pipeline. It also contains many examples of using:
- Convenience objects
- Subgraph projection
- Graph sample projection
It is written in pure Python, to showcase the GDS Python Client’s ability to abstract away from Cypher queries.
1. The dataset
Our input graph represents Wikipedia pages on particular topics, and how they link to each other:
- Chameleons
- Squirrels
- Crocodiles
The features represent the presence of certain informative nouns in the text of each page. The target is the average monthly traffic of the page.
The dataset was first published in Multi-scale Attributed Node Embedding by B. Rozemberczki, C. Allen and R. Sarkar, eprint 1909.13021. The version hosted here was taken from SNAP on 2022-11-14.
2. Prerequisites
In order to run this pipeline, you must have a running Neo4j DBMS with a recent version of the Neo4j Graph Data Science plugin installed. These requirements are satisfied if you have an AuraDS instance active and running.
# First, we must install the GDS Python Client
%pip install graphdatascience
import os
# Then, we connect to our Neo4j DBMS hosting the Graph Data Science library
from graphdatascience import GraphDataScience
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_AUTH = None
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"):
    NEO4J_AUTH = (
        os.environ.get("NEO4J_USER"),
        os.environ.get("NEO4J_PASSWORD"),
    )
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)
# Test our connection and print the Graph Data Science library version
print(gds.server_version())
from graphdatascience.server_version.server_version import ServerVersion
assert gds.server_version() >= ServerVersion(2, 1, 0)
# Importing the dataset
# The dataset is sourced from this GitHub repository
baseUrl = (
"https://raw.githubusercontent.com/neo4j/graph-data-science-client/main/examples/datasets/wikipedia-animals-pages"
)
# Constraints to speed up importing
gds.run_cypher(
"""
CREATE CONSTRAINT chameleons
FOR (c:Chameleon)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT crocodiles
FOR (c:Crocodile)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT squirrels
FOR (s:Squirrel)
REQUIRE s.id IS NODE KEY
"""
)
# Create nodes and relationships
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_edges.csv' AS row
MERGE (c1:Chameleon {id: row.id1})
MERGE (c2:Chameleon {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_edges.csv' AS row
MERGE (c1:Crocodile {id: row.id1})
MERGE (c2:Crocodile {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_edges.csv' AS row
MERGE (s1:Squirrel {id: row.id1})
MERGE (s2:Squirrel {id: row.id2})
MERGE (s1)-[:LINK]->(s2)
""",
{"baseUrl": baseUrl},
)
# Create target properties
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_target.csv' AS row
MATCH (c:Chameleon {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_target.csv' AS row
MATCH (c:Crocodile {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_target.csv' AS row
MATCH (s:Squirrel {id: row.id})
SET s.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
# Create feature vectors
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_features.csv' AS row
MATCH (c:Chameleon {id: row.id})
WITH c, split(row.features, '|') AS features
SET c.features = features
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_features.csv' AS row
MATCH (c:Crocodile {id: row.id})
WITH c, split(row.features, '|') AS features
SET c.features = features
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_features.csv' AS row
MATCH (c:Squirrel {id: row.id})
WITH c, split(row.features, '|') AS features
SET c.features = features
""",
{"baseUrl": baseUrl},
)
3. Preparing the dataset for the pipeline
To use the dataset, we must prepare the features in a format that the model supports and can work well with. In their raw form, the features are IDs of particular words, and are therefore not suitable as input to linear regression.
To overcome this, we will use one-hot encoding, which produces features that work well for linear regression. We begin by learning the dictionary of nouns for each node set. We create a node to host each dictionary, and then use it to one-hot encode all feature vectors.
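To make this concrete, here is a minimal, purely illustrative sketch of the idea in plain Python. The one_hot helper and the example terms below are hypothetical; the actual encoding in this notebook is performed by GDS in the Cypher statements that follow.
# Purely illustrative sketch of one-hot encoding (not the GDS implementation):
# given an ordered dictionary of terms ("totality") and a node's feature list,
# we emit a 0/1 vector with one position per dictionary term.
def one_hot(totality, features):
    present = set(features)
    return [1 if term in present else 0 for term in totality]

# Example: with dictionary ['lizard', 'nest', 'tail'] and features ['tail', 'lizard'],
# the encoding is [1, 0, 1].
print(one_hot(["lizard", "nest", "tail"], ["tail", "lizard"]))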
# Construct one-hot dictionaries
gds.run_cypher(
"""
MATCH (s:Chameleon)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'chameleon', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Crocodile)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'crocodile', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Squirrel)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'squirrel', totality: orderedTotality})
RETURN orderedTotality
"""
)
# Do one-hot encoding
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'chameleon'})
MATCH (c:Chameleon)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'crocodile'})
MATCH (c:Crocodile)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'squirrel'})
MATCH (c:Squirrel)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
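# Optional sanity check (not part of the original flow): verify the dimension of the
# one-hot encoded feature vectors before projecting the graph.
gds.run_cypher(
    """
    MATCH (c:Chameleon)
    RETURN size(c.features_one_hot) AS featureDimension
    LIMIT 1
    """
)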
# First, let's project our graph into the GDS Graph Catalog
# We will use a native projection to begin with
G_animals, projection_result = gds.graph.project(
"wiki_animals",
["Chameleon", "Squirrel", "Crocodile"],
{"LINK": {"orientation": "UNDIRECTED"}},
nodeProperties=["features_one_hot", "target"],
)
print(projection_result[["graphName", "nodeCount", "relationshipCount"]])
4. Connectivity
In graph analysis, it is common to operate only over connected graphs, that is, graphs consisting of a single component. The reason for this is that in most cases, information does not flow where there are no connections.
The fastest way to determine the number of components in our graph is to use the WCC (Weakly Connected Components) algorithm.
# We use the WCC algorithm to see how many components we have
wcc_result = gds.wcc.mutate(G_animals, mutateProperty="wcc_component")
print(wcc_result[["computeMillis", "componentCount"]])
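# As an aside: if we only needed the component count, and not a per-node component id
# to filter on later, a stats-mode call would avoid mutating the in-memory graph.
# Hedged sketch, assuming the WCC stats mode is available in your GDS version:
wcc_stats = gds.wcc.stats(G_animals)
print(wcc_stats["componentCount"])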
5. Component separation
Having learned that our graph consists of three components, we will next split it into separate graphs, one per component. We will use the subgraph projection to accomplish this.
# First, we stream the component ids
components = gds.graph.nodeProperty.stream(G_animals, "wcc_component")
# Second, we compute the unique component ids
component_ids = components["propertyValue"].unique()
# Third, we project a subgraph for each component
component_graphs = [
gds.beta.graph.project.subgraph(
f"animals_component_{component_id}",
G_animals,
f"n.wcc_component = {component_id}",
"*",
)[0]
for component_id in component_ids
]
# Lastly, we map each component graph's node labels to its graph object
graph_components_by_labels = {str(G_component.node_labels()): G_component for G_component in component_graphs}
print({k: v.name() for k, v in graph_components_by_labels.items()})
# Now, we are only interested in the Chameleon graph,
# so we will drop the other graphs and define a better variable for the one we keep
graph_components_by_labels[str(["Crocodile"])].drop()
graph_components_by_labels[str(["Squirrel"])].drop()
G_chameleon = graph_components_by_labels[str(["Chameleon"])]
# With the graph object G_chameleon, we can inspect some statistics
print("#nodes: " + str(G_chameleon.node_count()))
print("#relationships: " + str(G_chameleon.relationship_count()))
print("Degree distribution")
print("=" * 25)
print(G_chameleon.degree_distribution().sort_index())
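# An optional extra check (not in the original flow): the graph object can also report
# its density. This assumes the density() convenience method of the Graph object is
# available in your client version.
print("Density: " + str(G_chameleon.density()))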
6. Now, let’s construct a training pipeline!
We will create a Node Regression pipeline, and then:
- configure the splitting
- add model candidates
- configure auto-tuning
- add node property steps
- select model features
The pipeline lives in the Pipeline Catalog, and we are operating it through the Pipeline object, for maximum convenience.
# Now, let's construct a training pipeline!
chameleons_nr_training = gds.nr_pipe("node_regression_pipeline__Chameleons")
# We configure the splitting
chameleons_nr_training.configureSplit(validationFolds=5, testFraction=0.2)
# We add a set of model candidates
# A linear regression model with the learningRate parameter in a search space
chameleons_nr_training.addLinearRegression(
penalty=1e-5,
patience=3,
tolerance=1e-5,
minEpochs=20,
maxEpochs=500,
learningRate={"range": [100, 1000]}, # We let the auto-tuner find a good value
)
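# Optionally, more model candidates could be added to the same search.
# Hedged sketch (commented out): assuming your GDS version supports random forest
# regression, a candidate could be added with illustrative parameters like these:
# chameleons_nr_training.addRandomForest(numberOfDecisionTrees=10, maxDepth=10)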
# We let the auto-tuner evaluate a number of model candidates from the search space
chameleons_nr_training.configureAutoTuning(maxTrials=10)
# Our input feature dimension is 3132
# We can reduce the dimension to speed up training using a FastRP node embedding
chameleons_nr_training.addNodeProperty(
"fastRP",
embeddingDimension=256,
propertyRatio=0.8,
featureProperties=["features_one_hot"],
mutateProperty="frp_embedding",
randomSeed=420,
)
# And finally we select what features the model should be using
# We rely solely on the FastRP embedding, because it encapsulates the one-hot encoded source features
chameleons_nr_training.selectFeatures("frp_embedding")
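# Before training, we can optionally inspect how the pipeline is configured.
# Hedged sketch, assuming these accessor methods are available in your client version:
print(chameleons_nr_training.node_property_steps())
print(chameleons_nr_training.feature_properties())
print(chameleons_nr_training.parameter_space())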
# The training pipeline is now fully configured and ready to be run!
# We use the training pipeline to train a model
nc_model, train_result = chameleons_nr_training.train(
G_chameleon, # First, we use the entire Chameleon graph
modelName="chameleon_nr_model",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result["modelInfo"]["bestParameters"]))
print()
print("MEAN_SQUARED_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"]))
print("MEAN_ABSOLUTE_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"]))
# Let's sample the graph to see if we can get a similarly good model
G_chameleon_sample, _ = gds.alpha.graph.sample.rwr(
"cham_sample",
G_chameleon,
samplingRatio=0.30, # We'll use 30% of the graph
)
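# Optionally, we can check how much smaller the sampled graph is
print("Sampled #nodes: " + str(G_chameleon_sample.node_count()))
print("Sampled #relationships: " + str(G_chameleon_sample.relationship_count()))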
# Now we can use the same training pipeline to train another model, but faster!
nc_model_sample, train_result_sample = chameleons_nr_training.train(
G_chameleon_sample,
modelName="chameleon_nr_model_sample",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result_sample["modelInfo"]["bestParameters"]))
print()
print(
"MEAN_SQUARED_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"])
)
print(
"MEAN_ABSOLUTE_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"])
)
# Let's see what our models predict
# The speed-trained model, trained on 24% of the data (the 30% sample minus its 20% test set)
predicted_targets_sample = nc_model_sample.predict_stream(G_chameleon)
# The fully trained model on 80% training data (20% test set)
predicted_targets_full = nc_model.predict_stream(G_chameleon)
# The real target values, for comparison
real_targets = gds.graph.nodeProperty.stream(G_chameleon, "target")
# Merging the data frames
merged_full = real_targets.merge(predicted_targets_full, on="nodeId")
merged_all = merged_full.merge(predicted_targets_sample, on="nodeId")
# Look at the last 10 rows
print(merged_all.tail(10))
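# As an optional, rough comparison (not in the original flow), we can compute the mean
# absolute error of each model's predictions against the real targets on the full graph.
# This assumes the GDS 2.x column names: nodeProperty.stream yields 'propertyValue', and
# the merged prediction columns are suffixed '_x' (full model) and '_y' (sample-trained model).
mae_full = (merged_all["predictedValue_x"] - merged_all["propertyValue"]).abs().mean()
mae_sample = (merged_all["predictedValue_y"] - merged_all["propertyValue"]).abs().mean()
print("Full model MAE: " + str(mae_full))
print("Sample-trained model MAE: " + str(mae_sample))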