function generate_neo4j_schema_report
Generates a comprehensive schema report of a Neo4j graph database, including node labels, relationships, properties, constraints, indexes, and sample data, outputting multiple file formats (JSON, HTML, Python snippets, Cypher examples).
/tf/active/vicechatdev/neo4j_schema_report.py
24 - 243
complex
Purpose
This function connects to a Neo4j database and performs extensive introspection to document the complete database schema. It analyzes node labels, relationship types, property keys, constraints, indexes, and the connections between different node types. The function generates multiple output files including JSON schema data, diagram data for visualization, an HTML report, Python code snippets for interacting with the schema, and Cypher query examples. This is useful for database documentation, onboarding new developers, schema analysis, and generating boilerplate code for working with the database.
Source Code
def _quote_cypher_identifier(name):
    """Return *name* wrapped in backticks so it can be embedded in Cypher text.

    Labels and relationship types cannot be passed as query parameters, so
    they have to be interpolated into the query string. Quoting keeps names
    that contain spaces, hyphens or other special characters from breaking
    the query; a literal backtick inside the name is escaped by doubling it.
    """
    return "`" + str(name).replace("`", "``") + "`"


def _fetch_schema_rows(session, queries, description):
    """Return the rows (as dicts) of the first query in *queries* that succeeds.

    The queries are tried in order so that a newer syntax can fall back to an
    older procedure. Rows are fully collected before being returned, so a
    query that fails part-way through contributes nothing to the result.
    If every query fails, a notice is printed and an empty list is returned.
    """
    for query in queries:
        try:
            return [dict(record) for record in session.run(query)]
        except Exception:
            # Best effort: this syntax is not supported here, try the next one.
            continue
    print(f"Could not retrieve {description} information.")
    return []


def generate_neo4j_schema_report(
    neo4j_uri="bolt://localhost:7687",
    neo4j_username="neo4j",
    neo4j_password="password",
    output_dir="./neo4j_schema"
):
    """
    Generate a comprehensive schema report of a Neo4j database

    Collects node labels, relationship types, property keys, constraints,
    indexes, per-label / per-type counts, one sample node per label, one
    sample relationship per type, and which labels are connected by which
    relationship types. The result is written to ``neo4j_schema.json`` and
    ``neo4j_diagram.json`` in ``output_dir`` and then handed to the HTML,
    Python-snippet and Cypher-example generators.

    Parameters:
    - neo4j_uri: Neo4j server URI
    - neo4j_username: Neo4j username
    - neo4j_password: Neo4j password
    - output_dir: Directory to save the report files

    Returns None. Any exception raised while collecting or writing the
    report is printed and not re-raised; the driver is always closed.
    """
    print("Connecting to Neo4j and generating schema report...")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Connect to Neo4j
    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

    try:
        # Dictionary to store all schema information
        schema_info = {
            "node_labels": [],
            "relationship_types": [],
            "property_keys": [],
            "constraints": [],
            "indexes": [],
            "node_counts": {},
            "relationship_counts": {},
            "sample_nodes": {},
            "sample_relationships": {},
            "node_properties": {},
            "relationship_properties": {},
            "node_relationships": {},
        }

        with driver.session() as session:
            # Get node labels
            print("Retrieving node labels and counts...")
            result = session.run("""
                CALL db.labels() YIELD label
                RETURN label
                ORDER BY label
            """)
            schema_info["node_labels"] = [record["label"] for record in result]

            # Get node counts per label
            for label in schema_info["node_labels"]:
                quoted_label = _quote_cypher_identifier(label)
                count_result = session.run(
                    f"MATCH (n:{quoted_label}) RETURN count(n) as count"
                )
                schema_info["node_counts"][label] = count_result.single()["count"]

            # Get relationship types
            print("Retrieving relationship types and counts...")
            result = session.run("""
                CALL db.relationshipTypes() YIELD relationshipType
                RETURN relationshipType
                ORDER BY relationshipType
            """)
            schema_info["relationship_types"] = [
                record["relationshipType"] for record in result
            ]

            # Get relationship counts per type
            for rel_type in schema_info["relationship_types"]:
                quoted_type = _quote_cypher_identifier(rel_type)
                count_result = session.run(
                    f"MATCH ()-[r:{quoted_type}]->() RETURN count(r) as count"
                )
                schema_info["relationship_counts"][rel_type] = (
                    count_result.single()["count"]
                )

            # Get property keys
            print("Retrieving property keys...")
            result = session.run("""
                CALL db.propertyKeys() YIELD propertyKey
                RETURN propertyKey
                ORDER BY propertyKey
            """)
            schema_info["property_keys"] = [
                record["propertyKey"] for record in result
            ]

            # Get constraints: SHOW syntax first, legacy procedure as fallback
            print("Retrieving constraints...")
            schema_info["constraints"] = _fetch_schema_rows(
                session,
                ("SHOW CONSTRAINTS", "CALL db.constraints()"),
                "constraints",
            )

            # Get indexes: SHOW syntax first, legacy procedure as fallback
            print("Retrieving indexes...")
            schema_info["indexes"] = _fetch_schema_rows(
                session,
                ("SHOW INDEXES", "CALL db.indexes()"),
                "indexes",
            )

            # Get node properties per label.
            # Only a single sample node is inspected, so properties that are
            # absent from that particular node are not reported.
            print("Analyzing node properties per label...")
            for label in schema_info["node_labels"]:
                quoted_label = _quote_cypher_identifier(label)
                record = session.run(f"""
                    MATCH (n:{quoted_label})
                    RETURN n LIMIT 1
                """).single()
                if record:
                    node = record["n"]
                    schema_info["sample_nodes"][label] = dict(node)
                    # Sorted list (not a set) so the value is JSON serializable
                    schema_info["node_properties"][label] = sorted(node.keys())

            # Get relationship properties per type (same single-sample caveat)
            print("Analyzing relationship properties per type...")
            for rel_type in schema_info["relationship_types"]:
                quoted_type = _quote_cypher_identifier(rel_type)
                record = session.run(f"""
                    MATCH ()-[r:{quoted_type}]->()
                    RETURN r LIMIT 1
                """).single()
                if record:
                    rel = record["r"]
                    schema_info["sample_relationships"][rel_type] = dict(rel)
                    schema_info["relationship_properties"][rel_type] = sorted(
                        rel.keys()
                    )

            # Analyze node relationships (which labels connect to which).
            # NOTE: this issues one query per (source, target, type)
            # combination, which can be slow on schemas with many labels.
            print("Analyzing relationships between node labels...")
            node_relationships = schema_info["node_relationships"]
            for source_label in schema_info["node_labels"]:
                quoted_source = _quote_cypher_identifier(source_label)
                for target_label in schema_info["node_labels"]:
                    quoted_target = _quote_cypher_identifier(target_label)
                    for rel_type in schema_info["relationship_types"]:
                        quoted_type = _quote_cypher_identifier(rel_type)
                        result = session.run(f"""
                            MATCH (a:{quoted_source})-[r:{quoted_type}]->(b:{quoted_target})
                            RETURN count(r) as count
                        """)
                        count = result.single()["count"]
                        if count > 0:
                            # Entries are created only for combinations that
                            # actually occur, so the mapping stays sparse.
                            node_relationships.setdefault(
                                source_label, {}
                            ).setdefault(target_label, []).append({
                                "type": rel_type,
                                "count": count
                            })

        # Generate schema diagram data
        diagram_data = generate_diagram_data(schema_info)

        # Save all schema information as JSON using the custom encoder
        schema_file = os.path.join(output_dir, "neo4j_schema.json")
        with open(schema_file, "w", encoding="utf-8") as f:
            json.dump(schema_info, f, indent=2, cls=Neo4jEncoder)

        # Save diagram data using the custom encoder
        diagram_file = os.path.join(output_dir, "neo4j_diagram.json")
        with open(diagram_file, "w", encoding="utf-8") as f:
            json.dump(diagram_data, f, indent=2, cls=Neo4jEncoder)

        # Generate HTML report
        generate_html_report(schema_info, output_dir)

        # Generate Python code snippets
        generate_python_snippets(schema_info, output_dir)

        # Generate Cypher query examples
        generate_cypher_examples(schema_info, output_dir)

        print(f"Schema report generated in {output_dir}")
        print(f" - Full schema: {schema_file}")
        print(f" - Diagram data: {diagram_file}")
        print(f" - HTML report: {os.path.join(output_dir, 'neo4j_schema_report.html')}")
        print(f" - Python snippets: {os.path.join(output_dir, 'neo4j_python_snippets.py')}")
        print(f" - Cypher examples: {os.path.join(output_dir, 'neo4j_cypher_examples.cypher')}")
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it
        print(f"Error generating schema report: {str(e)}")
    finally:
        driver.close()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| neo4j_uri | - | 'bolt://localhost:7687' | positional_or_keyword |
| neo4j_username | - | 'neo4j' | positional_or_keyword |
| neo4j_password | - | 'password' | positional_or_keyword |
| output_dir | - | './neo4j_schema' | positional_or_keyword |
Parameter Details
neo4j_uri: The connection URI for the Neo4j database server. Expected format is 'bolt://hostname:port' or 'neo4j://hostname:port'. Default is 'bolt://localhost:7687' for local development instances.
neo4j_username: The username for authenticating with the Neo4j database. Default is 'neo4j' which is the standard default username for Neo4j installations.
neo4j_password: The password for authenticating with the Neo4j database. Default is 'password' but should be changed to match your actual database password.
output_dir: The directory path where all generated report files will be saved. The directory will be created if it doesn't exist. Default is './neo4j_schema' in the current working directory.
Return Value
This function does not return any value (implicitly returns None). Instead, it generates multiple files in the specified output directory: 'neo4j_schema.json' (complete schema information), 'neo4j_diagram.json' (diagram visualization data), 'neo4j_schema_report.html' (human-readable HTML report), 'neo4j_python_snippets.py' (Python code examples), and 'neo4j_cypher_examples.cypher' (Cypher query examples). The function prints status messages and file locations to stdout.
Dependencies
neo4j, pandas, os, json, sys, collections, datetime
Required Imports
import os
import json
from neo4j import GraphDatabase
from collections import defaultdict
Conditional/Optional Imports
These imports are only needed under specific conditions:
from neo4j import time
Condition: Used for handling Neo4j temporal types in the Neo4jEncoder class (if present in the codebase)
Optional
import pandas as pd
Condition: May be used in helper functions like generate_html_report or generate_python_snippets
Optional
from datetime import datetime
Condition: May be used for timestamping reports or handling date properties
Optional
import sys
Condition: May be used for error handling or system-level operations
Optional
Usage Example
# Basic usage: connect to a local Neo4j instance over Bolt and write the
# report files into ./schema_reports (the directory is created if missing)
generate_neo4j_schema_report(
neo4j_uri="bolt://localhost:7687",
neo4j_username="neo4j",
neo4j_password="your_password",
output_dir="./schema_reports"
)
# Remote instance: same call, using a neo4j:// URI and a separate output
# directory for that server's report
generate_neo4j_schema_report(
neo4j_uri="neo4j://production-server.example.com:7687",
neo4j_username="admin",
neo4j_password="secure_password",
output_dir="./production_schema"
)
# Minimal usage: keep the default URI, username and output directory and
# override only the password (the default 'password' is just a placeholder)
generate_neo4j_schema_report(neo4j_password="my_password")
Best Practices
- Always use strong passwords instead of the default 'password' value
- Ensure the Neo4j database is accessible before running to avoid connection errors
- For large databases, this function may take significant time to complete as it analyzes all labels and relationships
- The function requires several helper functions (generate_diagram_data, generate_html_report, generate_python_snippets, generate_cypher_examples) and a Neo4jEncoder class to be defined in the same module
- Use appropriate error handling when calling this function in production environments
- Consider running this function during off-peak hours for production databases to minimize performance impact
- The output directory will be created automatically, but ensure the parent directory exists and has write permissions
- The function handles different Neo4j versions (3.x, 4.x+) with fallback mechanisms for constraints and indexes
- Close the Neo4j driver connection properly (handled in finally block) to avoid resource leaks
- Review generated files for sensitive information before sharing, as they may contain sample data from the database
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function generate_diagram_data 68.5% similar
- function generate_cypher_examples 66.1% similar
- function generate_python_snippets 65.3% similar
- function generate_html_report 64.3% similar
- function initialize_schema 60.1% similar