diff --git a/.husky/pre-commit b/.husky/pre-commit index d7c0624ab..fb5d71c12 100755 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -2,3 +2,5 @@ ./mvnw sortpom:sort spotless:apply -f .teamcity ./mvnw sortpom:sort spotless:apply + +git update-index --again \ No newline at end of file diff --git a/.snyk b/.snyk index 12c4bfc09..c3ba34984 100644 --- a/.snyk +++ b/.snyk @@ -7,10 +7,14 @@ ignore: reason: Spark Core is provided dependency expires: 2050-01-01T00:00:00.000Z created: 2025-09-18T08:33:31.014 - - 'org.apache.spark:spark-core_2.12': <<: *spark-core - 'SNYK-JAVA-COMFASTERXMLJACKSONCORE-7569538': *spark-core-exclusions - 'SNYK-JAVA-COMGOOGLEPROTOBUF-8055227': *spark-core-exclusions - 'SNYK-JAVA-ORGAPACHEIVY-5847858': *spark-core-exclusions - 'SNYK-JAVA-ORGAPACHEZOOKEEPER-5961102': *spark-core-exclusions + 'SNYK-JAVA-ORGGLASSFISHJERSEYCORE-14049172': *spark-core-exclusions + 'SNYK-JAVA-IOAIRLIFT-14412703': &spark-sql-exclusions + - 'org.apache.spark:spark-sql_2.13': &spark-sql + reason: Spark SQL is provided dependency + expires: 2050-01-01T00:00:00.000Z + created: 2025-09-18T08:35:12.345 + <<: *spark-sql + 'SNYK-JAVA-ORGLZ4-14151788': *spark-core-exclusions + 'SNYK-JAVA-ORGLZ4-14219384': *spark-core-exclusions patch: {} diff --git a/README.md b/README.md index c0725982f..911dd827c 100644 --- a/README.md +++ b/README.md @@ -10,27 +10,25 @@ This neo4j-connector-apache-spark is Apache 2 Licensed The documentation for Neo4j Connector for Apache Spark lives at https://github.com/neo4j/docs-spark repository. -## Building for Spark 3 +## Building for Spark 4 -You can build for Spark 3.x with both Scala 2.12 and Scala 2.13 +You can build for Spark 4.x with Scala 2.13 ``` -./maven-release.sh package 2.12 ./maven-release.sh package 2.13 ``` These commands will generate the corresponding targets -* `spark-3/target/neo4j-connector-apache-spark_2.12-_for_spark_3.jar` -* `spark-3/target/neo4j-connector-apache-spark_2.13-_for_spark_3.jar` +* `spark/target/neo4j-connector-apache-spark_2.13-_for_spark_4.jar` ## Integration with Apache Spark Applications **spark-shell, pyspark, or spark-submit** -`$SPARK_HOME/bin/spark-shell --jars neo4j-connector-apache-spark_2.12-_for_spark_3.jar` +`$SPARK_HOME/bin/spark-shell --jars neo4j-connector-apache-spark_2.12-_for_spark_4.jar` -`$SPARK_HOME/bin/spark-shell --packages org.neo4j:neo4j-connector-apache-spark_2.12:_for_spark_3` +`$SPARK_HOME/bin/spark-shell --packages org.neo4j:neo4j-connector-apache-spark_2.13:_for_spark_4` **sbt** @@ -38,7 +36,7 @@ If you use the [sbt-spark-package plugin](https://github.com/databricks/sbt-spar ```scala resolvers += "Spark Packages Repo" at "http://dl.bintray.com/spark-packages/maven" -libraryDependencies += "org.neo4j" % "neo4j-connector-apache-spark_2.12" % "_for_spark_3" +libraryDependencies += "org.neo4j" % "neo4j-connector-apache-spark_2.13" % "_for_spark_4" ``` **maven** @@ -50,8 +48,8 @@ In your pom.xml, add: org.neo4j - neo4j-connector-apache-spark_2.12 - [version]_for_spark_3 + neo4j-connector-apache-spark_2.13 + [version]_for_spark_4 ``` diff --git a/common/LICENSES.txt b/common/LICENSES.txt index c5f11bab7..3ae89a5a4 100644 --- a/common/LICENSES.txt +++ b/common/LICENSES.txt @@ -4,7 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file. 
------------------------------------------------------------------------------ Apache Software License, Version 2.0 - IntelliJ IDEA Annotations + JetBrains Java Annotations Kotlin Stdlib Neo4j Bolt Connection (Bolt Provider reference impl) Neo4j Bolt Connection (Pooled Source impl) diff --git a/common/NOTICE.txt b/common/NOTICE.txt index ac66abf12..bd0cdbcfb 100644 --- a/common/NOTICE.txt +++ b/common/NOTICE.txt @@ -19,7 +19,7 @@ Third-party licenses -------------------- Apache Software License, Version 2.0 - IntelliJ IDEA Annotations + JetBrains Java Annotations Kotlin Stdlib Neo4j Bolt Connection (Bolt Provider reference impl) Neo4j Bolt Connection (Pooled Source impl) diff --git a/common/src/main/scala/org/neo4j/spark/service/SchemaService.scala b/common/src/main/scala/org/neo4j/spark/service/SchemaService.scala index fdec95dab..05452159e 100644 --- a/common/src/main/scala/org/neo4j/spark/service/SchemaService.scala +++ b/common/src/main/scala/org/neo4j/spark/service/SchemaService.scala @@ -734,7 +734,7 @@ class SchemaService( tx => { tx.run( s"CREATE CONSTRAINT $constraintName IF NOT EXISTS FOR $asciiRepresentation REQUIRE ($props) IS $constraintType" - ) + ).consume() }, sessionTransactionConfig ) diff --git a/common/src/test/scala/org/neo4j/spark/util/ValidationsTest.scala b/common/src/test/scala/org/neo4j/spark/util/ValidationsTest.scala index 793403c4f..7d4685c1c 100644 --- a/common/src/test/scala/org/neo4j/spark/util/ValidationsTest.scala +++ b/common/src/test/scala/org/neo4j/spark/util/ValidationsTest.scala @@ -30,7 +30,7 @@ class ValidationsTest extends SparkConnectorScalaBaseTSE { .map { _.version } .getOrElse("UNKNOWN") try { - Validations.validate(ValidateSparkMinVersion("3.10000")) + Validations.validate(ValidateSparkMinVersion("4.10000")) fail(s"should be thrown a ${classOf[IllegalArgumentException].getName}") } catch { case e: IllegalArgumentException => diff --git a/examples/neo4j_data_engineering.ipynb b/examples/neo4j_data_engineering.ipynb index f4ff13e24..d4bfa8e2b 100644 --- a/examples/neo4j_data_engineering.ipynb +++ b/examples/neo4j_data_engineering.ipynb @@ -1,3041 +1,3041 @@ { - "cells": [ - { - "cell_type": "markdown", - "source": [ - "Open this notebook in Google Colab \n", - " \"Open\n", - "" - ], - "metadata": { - "id": "EhTThKJMxDCy" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7Nvb-_bYx359" - }, - "source": [ - "# Example of a Simple data engineering workflow with Neo4j and Spark" - ] - }, - { - "cell_type": "markdown", - "source": [ - "This notebook contains a set of examples that explains how to extract insights from data using the Neo4j Connector for Apache Spark in a Data Engineering workflow with [AuraDB](https://neo4j.com/docs/aura/auradb/) our fully managed version of Neo4j database.\n", - "\n", - "The notebooks will enable you to test your knowledge with a set of exercises after each section.\n", - "\n", - "If you have any questions or problems feel free to write a post in the [Neo4j community forum](https://community.neo4j.com/) or in [Discord](https://discord.com/invite/neo4j).\n", - "\n", - "If you want more exercises feel free to open an issue in the [GitHub repository](https://github.com/neo4j/neo4j-spark-connector).\n", - "\n", - "Enjoy!" 
- ], - "metadata": { - "id": "e0bo6ido8tL7" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hXwkjQMnMXED" - }, - "source": [ - "### Configure the Spark Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BhZwh-RAz6Bo" - }, - "outputs": [], - "source": [ - "!apt-get install openjdk-17-jdk-headless -qq > /dev/null" - ] - }, - { - "cell_type": "code", - "source": [ - "spark_version = '3.3.4'" - ], - "metadata": { - "id": "gmEzhrux7Jek" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!wget -q https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz" - ], - "metadata": { - "id": "Ya6Nj_u3vdTL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "A3gsnSHl0F99" - }, - "outputs": [], - "source": [ - "!tar xf spark-$spark_version-bin-hadoop3.tgz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hSBQWKs90vSx" - }, - "outputs": [], - "source": [ - "!pip install -q findspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tnW0a1Gj080k" - }, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-amd64\"\n", - "os.environ[\"SPARK_HOME\"] = f\"/content/spark-{spark_version}-bin-hadoop3\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dlUBSezK1DpZ" - }, - "outputs": [], - "source": [ - "import findspark\n", - "findspark.init()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rd5KWGQiOVDV" - }, - "outputs": [], - "source": [ - "neo4j_url = \"\" # put your neo4j url here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uXbi_82KOTzU" - }, - "outputs": [], - "source": [ - "neo4j_user = \"neo4j\" # put your neo4j user here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Sw50wjxxOUqt" - }, - "outputs": [], - "source": [ - "neo4j_password = \"\" # put your neo4j password here" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dOUJ-W871Tur" - }, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession\n", - "spark = (SparkSession.builder\n", - " .master('local[*]')\n", - " .appName('Data engineering workflow with Neo4j and Spark')\n", - " .config('spark.ui.port', '4050')\n", - " # Just to show dataframes as tables\n", - " .config('spark.sql.repl.eagerEval.enabled', True)\n", - " .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:5.1.0_for_spark_3')\n", - " # As we're using always the same database instance we'll\n", - " # define them as global variables\n", - " # so we don't need to repeat them each time\n", - " .config(\"neo4j.url\", neo4j_url)\n", - " .config(\"neo4j.authentication.type\", \"basic\")\n", - " .config(\"neo4j.authentication.basic.username\", neo4j_user)\n", - " .config(\"neo4j.authentication.basic.password\", neo4j_password)\n", - " .getOrCreate())\n", - "spark" - ] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "## Exercises prerequisite\n", - "\n", - "In this notebook we and going to test your knowledge. 
Some of the exercises require the Neo4j Python driver to check if the exercises are being solved correctly.\n", - "\n", - "*Neo4j Python Driver is required only for verifying the exercises when you persist data from Spark to Neo4j*\n", - "\n", - "**It's not required by the Spark connector!!!**\n", - "\n", - "We'll use [Cy2Py](https://github.com/conker84/cy2py), a Jupyter extension that easily allows you to connect to Neo4j and visualize data from Jupyter notebooks.\n", - "For a detailed instruction about how to use it please dive into [this example](https://github.com/conker84/cy2py/blob/main/examples/Neo4j_Crime_Investigation_Dataset.ipynb)" - ], - "metadata": { - "id": "b6_YNZnZ5GdT" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install -q cy2py" - ], - "metadata": { - "id": "f5ZZJylo5Bbz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CsnO4C9X7vK0" - }, - "source": [ - "### Configure an Aura instance\n", - "\n", - "
\n", - "

Neo4j Aura DB is a fully managed cloud service: The zero-admin, always-on graph database for cloud developers.

\n", - "\n", - "Create a [free instance](https://console.neo4j.io/?ref=aura-lp&mpp=4bfb2414ab973c741b6f067bf06d5575&mpid=17f40ce03ac883-0f09bb214466c1-37677109-1ea000-17f40ce03ad975&_gl=1*ql4f6s*_ga*MTc2OTMwNjEwMy4xNjQ5NDI3MDE0*_ga_DL38Q8KGQC*MTY1MzQxMDQzMC43OS4xLjE2NTM0MTA3MjQuMA..&_ga=2.136543024.1659283742.1653295079-1769306103.1649427014&_gac=1.216269284.1653306922.CjwKCAjw4ayUBhA4EiwATWyBrl6dN0oaH9_btCfvzdhi77ieNP07GAkOYuz7wx9QEewBnG_FUIMg8xoCgLsQAvD_BwE)\n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "source": [ - "let's load the extension" - ], - "metadata": { - "id": "uKYEPEgOcG2b" - } - }, - { - "cell_type": "code", - "source": [ - "%load_ext cy2py" - ], - "metadata": { - "id": "38EeXF6icKOK" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Populate the database\n", - "\n", - "To perform this section go in the Neo4j Brower of your aura instance and paste the following query:\n", - "\n", - "
\n", - "\n", - "Show the Cypher query\n", - "\n", - "\n", - "```cypher\n", - "CREATE (TheMatrix:Movie {title:'The Matrix', released:1999, tagline:'Welcome to the Real World'})\n", - "CREATE (Keanu:Person {name:'Keanu Reeves', born:1964})\n", - "CREATE (Carrie:Person {name:'Carrie-Anne Moss', born:1967})\n", - "CREATE (Laurence:Person {name:'Laurence Fishburne', born:1961})\n", - "CREATE (Hugo:Person {name:'Hugo Weaving', born:1960})\n", - "CREATE (LillyW:Person {name:'Lilly Wachowski', born:1967})\n", - "CREATE (LanaW:Person {name:'Lana Wachowski', born:1965})\n", - "CREATE (JoelS:Person {name:'Joel Silver', born:1952})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrix),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrix),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrix),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrix),\n", - "(LillyW)-[:DIRECTED]->(TheMatrix),\n", - "(LanaW)-[:DIRECTED]->(TheMatrix),\n", - "(JoelS)-[:PRODUCED]->(TheMatrix)\n", - "\n", - "CREATE (Emil:Person {name:\"Emil Eifrem\", born:1978})\n", - "CREATE (Emil)-[:ACTED_IN {roles:[\"Emil\"]}]->(TheMatrix)\n", - "\n", - "CREATE (TheMatrixReloaded:Movie {title:'The Matrix Reloaded', released:2003, tagline:'Free your mind'})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixReloaded),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixReloaded),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixReloaded),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixReloaded),\n", - "(LillyW)-[:DIRECTED]->(TheMatrixReloaded),\n", - "(LanaW)-[:DIRECTED]->(TheMatrixReloaded),\n", - "(JoelS)-[:PRODUCED]->(TheMatrixReloaded)\n", - "\n", - "CREATE (TheMatrixRevolutions:Movie {title:'The Matrix Revolutions', released:2003, tagline:'Everything that has a beginning has an end'})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixRevolutions),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixRevolutions),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixRevolutions),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixRevolutions),\n", - "(LillyW)-[:DIRECTED]->(TheMatrixRevolutions),\n", - "(LanaW)-[:DIRECTED]->(TheMatrixRevolutions),\n", - "(JoelS)-[:PRODUCED]->(TheMatrixRevolutions)\n", - "\n", - "CREATE (TheDevilsAdvocate:Movie {title:\"The Devil's Advocate\", released:1997, tagline:'Evil has its winning ways'})\n", - "CREATE (Charlize:Person {name:'Charlize Theron', born:1975})\n", - "CREATE (Al:Person {name:'Al Pacino', born:1940})\n", - "CREATE (Taylor:Person {name:'Taylor Hackford', born:1944})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Kevin Lomax']}]->(TheDevilsAdvocate),\n", - "(Charlize)-[:ACTED_IN {roles:['Mary Ann Lomax']}]->(TheDevilsAdvocate),\n", - "(Al)-[:ACTED_IN {roles:['John Milton']}]->(TheDevilsAdvocate),\n", - "(Taylor)-[:DIRECTED]->(TheDevilsAdvocate)\n", - "\n", - "CREATE (AFewGoodMen:Movie {title:\"A Few Good Men\", released:1992, tagline:\"In the heart of the nation's capital, in a courthouse of the U.S. 
government, one man will stop at nothing to keep his honor, and one will stop at nothing to find the truth.\"})\n", - "CREATE (TomC:Person {name:'Tom Cruise', born:1962})\n", - "CREATE (JackN:Person {name:'Jack Nicholson', born:1937})\n", - "CREATE (DemiM:Person {name:'Demi Moore', born:1962})\n", - "CREATE (KevinB:Person {name:'Kevin Bacon', born:1958})\n", - "CREATE (KieferS:Person {name:'Kiefer Sutherland', born:1966})\n", - "CREATE (NoahW:Person {name:'Noah Wyle', born:1971})\n", - "CREATE (CubaG:Person {name:'Cuba Gooding Jr.', born:1968})\n", - "CREATE (KevinP:Person {name:'Kevin Pollak', born:1957})\n", - "CREATE (JTW:Person {name:'J.T. Walsh', born:1943})\n", - "CREATE (JamesM:Person {name:'James Marshall', born:1967})\n", - "CREATE (ChristopherG:Person {name:'Christopher Guest', born:1948})\n", - "CREATE (RobR:Person {name:'Rob Reiner', born:1947})\n", - "CREATE (AaronS:Person {name:'Aaron Sorkin', born:1961})\n", - "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Lt. Daniel Kaffee']}]->(AFewGoodMen),\n", - "(JackN)-[:ACTED_IN {roles:['Col. Nathan R. Jessup']}]->(AFewGoodMen),\n", - "(DemiM)-[:ACTED_IN {roles:['Lt. Cdr. JoAnne Galloway']}]->(AFewGoodMen),\n", - "(KevinB)-[:ACTED_IN {roles:['Capt. Jack Ross']}]->(AFewGoodMen),\n", - "(KieferS)-[:ACTED_IN {roles:['Lt. Jonathan Kendrick']}]->(AFewGoodMen),\n", - "(NoahW)-[:ACTED_IN {roles:['Cpl. Jeffrey Barnes']}]->(AFewGoodMen),\n", - "(CubaG)-[:ACTED_IN {roles:['Cpl. Carl Hammaker']}]->(AFewGoodMen),\n", - "(KevinP)-[:ACTED_IN {roles:['Lt. Sam Weinberg']}]->(AFewGoodMen),\n", - "(JTW)-[:ACTED_IN {roles:['Lt. Col. Matthew Andrew Markinson']}]->(AFewGoodMen),\n", - "(JamesM)-[:ACTED_IN {roles:['Pfc. Louden Downey']}]->(AFewGoodMen),\n", - "(ChristopherG)-[:ACTED_IN {roles:['Dr. Stone']}]->(AFewGoodMen),\n", - "(AaronS)-[:ACTED_IN {roles:['Man in Bar']}]->(AFewGoodMen),\n", - "(RobR)-[:DIRECTED]->(AFewGoodMen),\n", - "(AaronS)-[:WROTE]->(AFewGoodMen)\n", - "\n", - "CREATE (TopGun:Movie {title:\"Top Gun\", released:1986, tagline:'I feel the need, the need for speed.'})\n", - "CREATE (KellyM:Person {name:'Kelly McGillis', born:1957})\n", - "CREATE (ValK:Person {name:'Val Kilmer', born:1959})\n", - "CREATE (AnthonyE:Person {name:'Anthony Edwards', born:1962})\n", - "CREATE (TomS:Person {name:'Tom Skerritt', born:1933})\n", - "CREATE (MegR:Person {name:'Meg Ryan', born:1961})\n", - "CREATE (TonyS:Person {name:'Tony Scott', born:1944})\n", - "CREATE (JimC:Person {name:'Jim Cash', born:1941})\n", - "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Maverick']}]->(TopGun),\n", - "(KellyM)-[:ACTED_IN {roles:['Charlie']}]->(TopGun),\n", - "(ValK)-[:ACTED_IN {roles:['Iceman']}]->(TopGun),\n", - "(AnthonyE)-[:ACTED_IN {roles:['Goose']}]->(TopGun),\n", - "(TomS)-[:ACTED_IN {roles:['Viper']}]->(TopGun),\n", - "(MegR)-[:ACTED_IN {roles:['Carole']}]->(TopGun),\n", - "(TonyS)-[:DIRECTED]->(TopGun),\n", - "(JimC)-[:WROTE]->(TopGun)\n", - "\n", - "CREATE (JerryMaguire:Movie {title:'Jerry Maguire', released:2000, tagline:'The rest of his life begins now.'})\n", - "CREATE (ReneeZ:Person {name:'Renee Zellweger', born:1969})\n", - "CREATE (KellyP:Person {name:'Kelly Preston', born:1962})\n", - "CREATE (JerryO:Person {name:\"Jerry O'Connell\", born:1974})\n", - "CREATE (JayM:Person {name:'Jay Mohr', born:1970})\n", - "CREATE (BonnieH:Person {name:'Bonnie Hunt', born:1961})\n", - "CREATE (ReginaK:Person {name:'Regina King', born:1971})\n", - "CREATE (JonathanL:Person {name:'Jonathan Lipnicki', born:1996})\n", - "CREATE (CameronC:Person {name:'Cameron Crowe', born:1957})\n", 
- "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Jerry Maguire']}]->(JerryMaguire),\n", - "(CubaG)-[:ACTED_IN {roles:['Rod Tidwell']}]->(JerryMaguire),\n", - "(ReneeZ)-[:ACTED_IN {roles:['Dorothy Boyd']}]->(JerryMaguire),\n", - "(KellyP)-[:ACTED_IN {roles:['Avery Bishop']}]->(JerryMaguire),\n", - "(JerryO)-[:ACTED_IN {roles:['Frank Cushman']}]->(JerryMaguire),\n", - "(JayM)-[:ACTED_IN {roles:['Bob Sugar']}]->(JerryMaguire),\n", - "(BonnieH)-[:ACTED_IN {roles:['Laurel Boyd']}]->(JerryMaguire),\n", - "(ReginaK)-[:ACTED_IN {roles:['Marcee Tidwell']}]->(JerryMaguire),\n", - "(JonathanL)-[:ACTED_IN {roles:['Ray Boyd']}]->(JerryMaguire),\n", - "(CameronC)-[:DIRECTED]->(JerryMaguire),\n", - "(CameronC)-[:PRODUCED]->(JerryMaguire),\n", - "(CameronC)-[:WROTE]->(JerryMaguire)\n", - "\n", - "CREATE (StandByMe:Movie {title:\"Stand By Me\", released:1986, tagline:\"For some, it's the last real taste of innocence, and the first real taste of life. But for everyone, it's the time that memories are made of.\"})\n", - "CREATE (RiverP:Person {name:'River Phoenix', born:1970})\n", - "CREATE (CoreyF:Person {name:'Corey Feldman', born:1971})\n", - "CREATE (WilW:Person {name:'Wil Wheaton', born:1972})\n", - "CREATE (JohnC:Person {name:'John Cusack', born:1966})\n", - "CREATE (MarshallB:Person {name:'Marshall Bell', born:1942})\n", - "CREATE\n", - "(WilW)-[:ACTED_IN {roles:['Gordie Lachance']}]->(StandByMe),\n", - "(RiverP)-[:ACTED_IN {roles:['Chris Chambers']}]->(StandByMe),\n", - "(JerryO)-[:ACTED_IN {roles:['Vern Tessio']}]->(StandByMe),\n", - "(CoreyF)-[:ACTED_IN {roles:['Teddy Duchamp']}]->(StandByMe),\n", - "(JohnC)-[:ACTED_IN {roles:['Denny Lachance']}]->(StandByMe),\n", - "(KieferS)-[:ACTED_IN {roles:['Ace Merrill']}]->(StandByMe),\n", - "(MarshallB)-[:ACTED_IN {roles:['Mr. Lachance']}]->(StandByMe),\n", - "(RobR)-[:DIRECTED]->(StandByMe)\n", - "\n", - "CREATE (AsGoodAsItGets:Movie {title:'As Good as It Gets', released:1997, tagline:'A comedy from the heart that goes for the throat.'})\n", - "CREATE (HelenH:Person {name:'Helen Hunt', born:1963})\n", - "CREATE (GregK:Person {name:'Greg Kinnear', born:1963})\n", - "CREATE (JamesB:Person {name:'James L. Brooks', born:1940})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Melvin Udall']}]->(AsGoodAsItGets),\n", - "(HelenH)-[:ACTED_IN {roles:['Carol Connelly']}]->(AsGoodAsItGets),\n", - "(GregK)-[:ACTED_IN {roles:['Simon Bishop']}]->(AsGoodAsItGets),\n", - "(CubaG)-[:ACTED_IN {roles:['Frank Sachs']}]->(AsGoodAsItGets),\n", - "(JamesB)-[:DIRECTED]->(AsGoodAsItGets)\n", - "\n", - "CREATE (WhatDreamsMayCome:Movie {title:'What Dreams May Come', released:1998, tagline:'After life there is more. 
The end is just the beginning.'})\n", - "CREATE (AnnabellaS:Person {name:'Annabella Sciorra', born:1960})\n", - "CREATE (MaxS:Person {name:'Max von Sydow', born:1929})\n", - "CREATE (WernerH:Person {name:'Werner Herzog', born:1942})\n", - "CREATE (Robin:Person {name:'Robin Williams', born:1951})\n", - "CREATE (VincentW:Person {name:'Vincent Ward', born:1956})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Chris Nielsen']}]->(WhatDreamsMayCome),\n", - "(CubaG)-[:ACTED_IN {roles:['Albert Lewis']}]->(WhatDreamsMayCome),\n", - "(AnnabellaS)-[:ACTED_IN {roles:['Annie Collins-Nielsen']}]->(WhatDreamsMayCome),\n", - "(MaxS)-[:ACTED_IN {roles:['The Tracker']}]->(WhatDreamsMayCome),\n", - "(WernerH)-[:ACTED_IN {roles:['The Face']}]->(WhatDreamsMayCome),\n", - "(VincentW)-[:DIRECTED]->(WhatDreamsMayCome)\n", - "\n", - "CREATE (SnowFallingonCedars:Movie {title:'Snow Falling on Cedars', released:1999, tagline:'First loves last. Forever.'})\n", - "CREATE (EthanH:Person {name:'Ethan Hawke', born:1970})\n", - "CREATE (RickY:Person {name:'Rick Yune', born:1971})\n", - "CREATE (JamesC:Person {name:'James Cromwell', born:1940})\n", - "CREATE (ScottH:Person {name:'Scott Hicks', born:1953})\n", - "CREATE\n", - "(EthanH)-[:ACTED_IN {roles:['Ishmael Chambers']}]->(SnowFallingonCedars),\n", - "(RickY)-[:ACTED_IN {roles:['Kazuo Miyamoto']}]->(SnowFallingonCedars),\n", - "(MaxS)-[:ACTED_IN {roles:['Nels Gudmundsson']}]->(SnowFallingonCedars),\n", - "(JamesC)-[:ACTED_IN {roles:['Judge Fielding']}]->(SnowFallingonCedars),\n", - "(ScottH)-[:DIRECTED]->(SnowFallingonCedars)\n", - "\n", - "CREATE (YouveGotMail:Movie {title:\"You've Got Mail\", released:1998, tagline:'At odds in life... in love on-line.'})\n", - "CREATE (ParkerP:Person {name:'Parker Posey', born:1968})\n", - "CREATE (DaveC:Person {name:'Dave Chappelle', born:1973})\n", - "CREATE (SteveZ:Person {name:'Steve Zahn', born:1967})\n", - "CREATE (TomH:Person {name:'Tom Hanks', born:1956})\n", - "CREATE (NoraE:Person {name:'Nora Ephron', born:1941})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Joe Fox']}]->(YouveGotMail),\n", - "(MegR)-[:ACTED_IN {roles:['Kathleen Kelly']}]->(YouveGotMail),\n", - "(GregK)-[:ACTED_IN {roles:['Frank Navasky']}]->(YouveGotMail),\n", - "(ParkerP)-[:ACTED_IN {roles:['Patricia Eden']}]->(YouveGotMail),\n", - "(DaveC)-[:ACTED_IN {roles:['Kevin Jackson']}]->(YouveGotMail),\n", - "(SteveZ)-[:ACTED_IN {roles:['George Pappas']}]->(YouveGotMail),\n", - "(NoraE)-[:DIRECTED]->(YouveGotMail)\n", - "\n", - "CREATE (SleeplessInSeattle:Movie {title:'Sleepless in Seattle', released:1993, tagline:'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?'})\n", - "CREATE (RitaW:Person {name:'Rita Wilson', born:1956})\n", - "CREATE (BillPull:Person {name:'Bill Pullman', born:1953})\n", - "CREATE (VictorG:Person {name:'Victor Garber', born:1949})\n", - "CREATE (RosieO:Person {name:\"Rosie O'Donnell\", born:1962})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Sam Baldwin']}]->(SleeplessInSeattle),\n", - "(MegR)-[:ACTED_IN {roles:['Annie Reed']}]->(SleeplessInSeattle),\n", - "(RitaW)-[:ACTED_IN {roles:['Suzy']}]->(SleeplessInSeattle),\n", - "(BillPull)-[:ACTED_IN {roles:['Walter']}]->(SleeplessInSeattle),\n", - "(VictorG)-[:ACTED_IN {roles:['Greg']}]->(SleeplessInSeattle),\n", - "(RosieO)-[:ACTED_IN {roles:['Becky']}]->(SleeplessInSeattle),\n", - "(NoraE)-[:DIRECTED]->(SleeplessInSeattle)\n", - "\n", - "CREATE (JoeVersustheVolcano:Movie {title:'Joe Versus the Volcano', released:1990, tagline:'A 
story of love, lava and burning desire.'})\n", - "CREATE (JohnS:Person {name:'John Patrick Stanley', born:1950})\n", - "CREATE (Nathan:Person {name:'Nathan Lane', born:1956})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Joe Banks']}]->(JoeVersustheVolcano),\n", - "(MegR)-[:ACTED_IN {roles:['DeDe', 'Angelica Graynamore', 'Patricia Graynamore']}]->(JoeVersustheVolcano),\n", - "(Nathan)-[:ACTED_IN {roles:['Baw']}]->(JoeVersustheVolcano),\n", - "(JohnS)-[:DIRECTED]->(JoeVersustheVolcano)\n", - "\n", - "CREATE (WhenHarryMetSally:Movie {title:'When Harry Met Sally', released:1998, tagline:'Can two friends sleep together and still love each other in the morning?'})\n", - "CREATE (BillyC:Person {name:'Billy Crystal', born:1948})\n", - "CREATE (CarrieF:Person {name:'Carrie Fisher', born:1956})\n", - "CREATE (BrunoK:Person {name:'Bruno Kirby', born:1949})\n", - "CREATE\n", - "(BillyC)-[:ACTED_IN {roles:['Harry Burns']}]->(WhenHarryMetSally),\n", - "(MegR)-[:ACTED_IN {roles:['Sally Albright']}]->(WhenHarryMetSally),\n", - "(CarrieF)-[:ACTED_IN {roles:['Marie']}]->(WhenHarryMetSally),\n", - "(BrunoK)-[:ACTED_IN {roles:['Jess']}]->(WhenHarryMetSally),\n", - "(RobR)-[:DIRECTED]->(WhenHarryMetSally),\n", - "(RobR)-[:PRODUCED]->(WhenHarryMetSally),\n", - "(NoraE)-[:PRODUCED]->(WhenHarryMetSally),\n", - "(NoraE)-[:WROTE]->(WhenHarryMetSally)\n", - "\n", - "CREATE (ThatThingYouDo:Movie {title:'That Thing You Do', released:1996, tagline:'In every life there comes a time when that thing you dream becomes that thing you do'})\n", - "CREATE (LivT:Person {name:'Liv Tyler', born:1977})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Mr. White']}]->(ThatThingYouDo),\n", - "(LivT)-[:ACTED_IN {roles:['Faye Dolan']}]->(ThatThingYouDo),\n", - "(Charlize)-[:ACTED_IN {roles:['Tina']}]->(ThatThingYouDo),\n", - "(TomH)-[:DIRECTED]->(ThatThingYouDo)\n", - "\n", - "CREATE (TheReplacements:Movie {title:'The Replacements', released:2000, tagline:'Pain heals, Chicks dig scars... 
Glory lasts forever'})\n", - "CREATE (Brooke:Person {name:'Brooke Langton', born:1970})\n", - "CREATE (Gene:Person {name:'Gene Hackman', born:1930})\n", - "CREATE (Orlando:Person {name:'Orlando Jones', born:1968})\n", - "CREATE (Howard:Person {name:'Howard Deutch', born:1950})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Shane Falco']}]->(TheReplacements),\n", - "(Brooke)-[:ACTED_IN {roles:['Annabelle Farrell']}]->(TheReplacements),\n", - "(Gene)-[:ACTED_IN {roles:['Jimmy McGinty']}]->(TheReplacements),\n", - "(Orlando)-[:ACTED_IN {roles:['Clifford Franklin']}]->(TheReplacements),\n", - "(Howard)-[:DIRECTED]->(TheReplacements)\n", - "\n", - "CREATE (RescueDawn:Movie {title:'RescueDawn', released:2006, tagline:\"Based on the extraordinary true story of one man's fight for freedom\"})\n", - "CREATE (ChristianB:Person {name:'Christian Bale', born:1974})\n", - "CREATE (ZachG:Person {name:'Zach Grenier', born:1954})\n", - "CREATE\n", - "(MarshallB)-[:ACTED_IN {roles:['Admiral']}]->(RescueDawn),\n", - "(ChristianB)-[:ACTED_IN {roles:['Dieter Dengler']}]->(RescueDawn),\n", - "(ZachG)-[:ACTED_IN {roles:['Squad Leader']}]->(RescueDawn),\n", - "(SteveZ)-[:ACTED_IN {roles:['Duane']}]->(RescueDawn),\n", - "(WernerH)-[:DIRECTED]->(RescueDawn)\n", - "\n", - "CREATE (TheBirdcage:Movie {title:'The Birdcage', released:1996, tagline:'Come as you are'})\n", - "CREATE (MikeN:Person {name:'Mike Nichols', born:1931})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Armand Goldman']}]->(TheBirdcage),\n", - "(Nathan)-[:ACTED_IN {roles:['Albert Goldman']}]->(TheBirdcage),\n", - "(Gene)-[:ACTED_IN {roles:['Sen. Kevin Keeley']}]->(TheBirdcage),\n", - "(MikeN)-[:DIRECTED]->(TheBirdcage)\n", - "\n", - "CREATE (Unforgiven:Movie {title:'Unforgiven', released:1992, tagline:\"It's a hell of a thing, killing a man\"})\n", - "CREATE (RichardH:Person {name:'Richard Harris', born:1930})\n", - "CREATE (ClintE:Person {name:'Clint Eastwood', born:1930})\n", - "CREATE\n", - "(RichardH)-[:ACTED_IN {roles:['English Bob']}]->(Unforgiven),\n", - "(ClintE)-[:ACTED_IN {roles:['Bill Munny']}]->(Unforgiven),\n", - "(Gene)-[:ACTED_IN {roles:['Little Bill Daggett']}]->(Unforgiven),\n", - "(ClintE)-[:DIRECTED]->(Unforgiven)\n", - "\n", - "CREATE (JohnnyMnemonic:Movie {title:'Johnny Mnemonic', released:1995, tagline:'The hottest data on earth. In the coolest head in town'})\n", - "CREATE (Takeshi:Person {name:'Takeshi Kitano', born:1947})\n", - "CREATE (Dina:Person {name:'Dina Meyer', born:1968})\n", - "CREATE (IceT:Person {name:'Ice-T', born:1958})\n", - "CREATE (RobertL:Person {name:'Robert Longo', born:1953})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Johnny Mnemonic']}]->(JohnnyMnemonic),\n", - "(Takeshi)-[:ACTED_IN {roles:['Takahashi']}]->(JohnnyMnemonic),\n", - "(Dina)-[:ACTED_IN {roles:['Jane']}]->(JohnnyMnemonic),\n", - "(IceT)-[:ACTED_IN {roles:['J-Bone']}]->(JohnnyMnemonic),\n", - "(RobertL)-[:DIRECTED]->(JohnnyMnemonic)\n", - "\n", - "CREATE (CloudAtlas:Movie {title:'Cloud Atlas', released:2012, tagline:'Everything is connected'})\n", - "CREATE (HalleB:Person {name:'Halle Berry', born:1966})\n", - "CREATE (JimB:Person {name:'Jim Broadbent', born:1949})\n", - "CREATE (TomT:Person {name:'Tom Tykwer', born:1965})\n", - "CREATE (DavidMitchell:Person {name:'David Mitchell', born:1969})\n", - "CREATE (StefanArndt:Person {name:'Stefan Arndt', born:1961})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Zachry', 'Dr. 
Henry Goose', 'Isaac Sachs', 'Dermot Hoggins']}]->(CloudAtlas),\n", - "(Hugo)-[:ACTED_IN {roles:['Bill Smoke', 'Haskell Moore', 'Tadeusz Kesselring', 'Nurse Noakes', 'Boardman Mephi', 'Old Georgie']}]->(CloudAtlas),\n", - "(HalleB)-[:ACTED_IN {roles:['Luisa Rey', 'Jocasta Ayrs', 'Ovid', 'Meronym']}]->(CloudAtlas),\n", - "(JimB)-[:ACTED_IN {roles:['Vyvyan Ayrs', 'Captain Molyneux', 'Timothy Cavendish']}]->(CloudAtlas),\n", - "(TomT)-[:DIRECTED]->(CloudAtlas),\n", - "(LillyW)-[:DIRECTED]->(CloudAtlas),\n", - "(LanaW)-[:DIRECTED]->(CloudAtlas),\n", - "(DavidMitchell)-[:WROTE]->(CloudAtlas),\n", - "(StefanArndt)-[:PRODUCED]->(CloudAtlas)\n", - "\n", - "CREATE (TheDaVinciCode:Movie {title:'The Da Vinci Code', released:2006, tagline:'Break The Codes'})\n", - "CREATE (IanM:Person {name:'Ian McKellen', born:1939})\n", - "CREATE (AudreyT:Person {name:'Audrey Tautou', born:1976})\n", - "CREATE (PaulB:Person {name:'Paul Bettany', born:1971})\n", - "CREATE (RonH:Person {name:'Ron Howard', born:1954})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Dr. Robert Langdon']}]->(TheDaVinciCode),\n", - "(IanM)-[:ACTED_IN {roles:['Sir Leight Teabing']}]->(TheDaVinciCode),\n", - "(AudreyT)-[:ACTED_IN {roles:['Sophie Neveu']}]->(TheDaVinciCode),\n", - "(PaulB)-[:ACTED_IN {roles:['Silas']}]->(TheDaVinciCode),\n", - "(RonH)-[:DIRECTED]->(TheDaVinciCode)\n", - "\n", - "CREATE (VforVendetta:Movie {title:'V for Vendetta', released:2006, tagline:'Freedom! Forever!'})\n", - "CREATE (NatalieP:Person {name:'Natalie Portman', born:1981})\n", - "CREATE (StephenR:Person {name:'Stephen Rea', born:1946})\n", - "CREATE (JohnH:Person {name:'John Hurt', born:1940})\n", - "CREATE (BenM:Person {name: 'Ben Miles', born:1967})\n", - "CREATE\n", - "(Hugo)-[:ACTED_IN {roles:['V']}]->(VforVendetta),\n", - "(NatalieP)-[:ACTED_IN {roles:['Evey Hammond']}]->(VforVendetta),\n", - "(StephenR)-[:ACTED_IN {roles:['Eric Finch']}]->(VforVendetta),\n", - "(JohnH)-[:ACTED_IN {roles:['High Chancellor Adam Sutler']}]->(VforVendetta),\n", - "(BenM)-[:ACTED_IN {roles:['Dascomb']}]->(VforVendetta),\n", - "(JamesM)-[:DIRECTED]->(VforVendetta),\n", - "(LillyW)-[:PRODUCED]->(VforVendetta),\n", - "(LanaW)-[:PRODUCED]->(VforVendetta),\n", - "(JoelS)-[:PRODUCED]->(VforVendetta),\n", - "(LillyW)-[:WROTE]->(VforVendetta),\n", - "(LanaW)-[:WROTE]->(VforVendetta)\n", - "\n", - "CREATE (SpeedRacer:Movie {title:'Speed Racer', released:2008, tagline:'Speed has no limits'})\n", - "CREATE (EmileH:Person {name:'Emile Hirsch', born:1985})\n", - "CREATE (JohnG:Person {name:'John Goodman', born:1960})\n", - "CREATE (SusanS:Person {name:'Susan Sarandon', born:1946})\n", - "CREATE (MatthewF:Person {name:'Matthew Fox', born:1966})\n", - "CREATE (ChristinaR:Person {name:'Christina Ricci', born:1980})\n", - "CREATE (Rain:Person {name:'Rain', born:1982})\n", - "CREATE\n", - "(EmileH)-[:ACTED_IN {roles:['Speed Racer']}]->(SpeedRacer),\n", - "(JohnG)-[:ACTED_IN {roles:['Pops']}]->(SpeedRacer),\n", - "(SusanS)-[:ACTED_IN {roles:['Mom']}]->(SpeedRacer),\n", - "(MatthewF)-[:ACTED_IN {roles:['Racer X']}]->(SpeedRacer),\n", - "(ChristinaR)-[:ACTED_IN {roles:['Trixie']}]->(SpeedRacer),\n", - "(Rain)-[:ACTED_IN {roles:['Taejo Togokahn']}]->(SpeedRacer),\n", - "(BenM)-[:ACTED_IN {roles:['Cass Jones']}]->(SpeedRacer),\n", - "(LillyW)-[:DIRECTED]->(SpeedRacer),\n", - "(LanaW)-[:DIRECTED]->(SpeedRacer),\n", - "(LillyW)-[:WROTE]->(SpeedRacer),\n", - "(LanaW)-[:WROTE]->(SpeedRacer),\n", - "(JoelS)-[:PRODUCED]->(SpeedRacer)\n", - "\n", - "CREATE (NinjaAssassin:Movie {title:'Ninja Assassin', 
released:2009, tagline:'Prepare to enter a secret world of assassins'})\n", - "CREATE (NaomieH:Person {name:'Naomie Harris'})\n", - "CREATE\n", - "(Rain)-[:ACTED_IN {roles:['Raizo']}]->(NinjaAssassin),\n", - "(NaomieH)-[:ACTED_IN {roles:['Mika Coretti']}]->(NinjaAssassin),\n", - "(RickY)-[:ACTED_IN {roles:['Takeshi']}]->(NinjaAssassin),\n", - "(BenM)-[:ACTED_IN {roles:['Ryan Maslow']}]->(NinjaAssassin),\n", - "(JamesM)-[:DIRECTED]->(NinjaAssassin),\n", - "(LillyW)-[:PRODUCED]->(NinjaAssassin),\n", - "(LanaW)-[:PRODUCED]->(NinjaAssassin),\n", - "(JoelS)-[:PRODUCED]->(NinjaAssassin)\n", - "\n", - "CREATE (TheGreenMile:Movie {title:'The Green Mile', released:1999, tagline:\"Walk a mile you'll never forget.\"})\n", - "CREATE (MichaelD:Person {name:'Michael Clarke Duncan', born:1957})\n", - "CREATE (DavidM:Person {name:'David Morse', born:1953})\n", - "CREATE (SamR:Person {name:'Sam Rockwell', born:1968})\n", - "CREATE (GaryS:Person {name:'Gary Sinise', born:1955})\n", - "CREATE (PatriciaC:Person {name:'Patricia Clarkson', born:1959})\n", - "CREATE (FrankD:Person {name:'Frank Darabont', born:1959})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Paul Edgecomb']}]->(TheGreenMile),\n", - "(MichaelD)-[:ACTED_IN {roles:['John Coffey']}]->(TheGreenMile),\n", - "(DavidM)-[:ACTED_IN {roles:['Brutus \"Brutal\" Howell']}]->(TheGreenMile),\n", - "(BonnieH)-[:ACTED_IN {roles:['Jan Edgecomb']}]->(TheGreenMile),\n", - "(JamesC)-[:ACTED_IN {roles:['Warden Hal Moores']}]->(TheGreenMile),\n", - "(SamR)-[:ACTED_IN {roles:['\"Wild Bill\" Wharton']}]->(TheGreenMile),\n", - "(GaryS)-[:ACTED_IN {roles:['Burt Hammersmith']}]->(TheGreenMile),\n", - "(PatriciaC)-[:ACTED_IN {roles:['Melinda Moores']}]->(TheGreenMile),\n", - "(FrankD)-[:DIRECTED]->(TheGreenMile)\n", - "\n", - "CREATE (FrostNixon:Movie {title:'Frost/Nixon', released:2008, tagline:'400 million people were waiting for the truth.'})\n", - "CREATE (FrankL:Person {name:'Frank Langella', born:1938})\n", - "CREATE (MichaelS:Person {name:'Michael Sheen', born:1969})\n", - "CREATE (OliverP:Person {name:'Oliver Platt', born:1960})\n", - "CREATE\n", - "(FrankL)-[:ACTED_IN {roles:['Richard Nixon']}]->(FrostNixon),\n", - "(MichaelS)-[:ACTED_IN {roles:['David Frost']}]->(FrostNixon),\n", - "(KevinB)-[:ACTED_IN {roles:['Jack Brennan']}]->(FrostNixon),\n", - "(OliverP)-[:ACTED_IN {roles:['Bob Zelnick']}]->(FrostNixon),\n", - "(SamR)-[:ACTED_IN {roles:['James Reston, Jr.']}]->(FrostNixon),\n", - "(RonH)-[:DIRECTED]->(FrostNixon)\n", - "\n", - "CREATE (Hoffa:Movie {title:'Hoffa', released:1992, tagline:\"He didn't want law. He wanted justice.\"})\n", - "CREATE (DannyD:Person {name:'Danny DeVito', born:1944})\n", - "CREATE (JohnR:Person {name:'John C. 
Reilly', born:1965})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Hoffa']}]->(Hoffa),\n", - "(DannyD)-[:ACTED_IN {roles:['Robert \"Bobby\" Ciaro']}]->(Hoffa),\n", - "(JTW)-[:ACTED_IN {roles:['Frank Fitzsimmons']}]->(Hoffa),\n", - "(JohnR)-[:ACTED_IN {roles:['Peter \"Pete\" Connelly']}]->(Hoffa),\n", - "(DannyD)-[:DIRECTED]->(Hoffa)\n", - "\n", - "CREATE (Apollo13:Movie {title:'Apollo 13', released:1995, tagline:'Houston, we have a problem.'})\n", - "CREATE (EdH:Person {name:'Ed Harris', born:1950})\n", - "CREATE (BillPax:Person {name:'Bill Paxton', born:1955})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Jim Lovell']}]->(Apollo13),\n", - "(KevinB)-[:ACTED_IN {roles:['Jack Swigert']}]->(Apollo13),\n", - "(EdH)-[:ACTED_IN {roles:['Gene Kranz']}]->(Apollo13),\n", - "(BillPax)-[:ACTED_IN {roles:['Fred Haise']}]->(Apollo13),\n", - "(GaryS)-[:ACTED_IN {roles:['Ken Mattingly']}]->(Apollo13),\n", - "(RonH)-[:DIRECTED]->(Apollo13)\n", - "\n", - "CREATE (Twister:Movie {title:'Twister', released:1996, tagline:\"Don't Breathe. Don't Look Back.\"})\n", - "CREATE (PhilipH:Person {name:'Philip Seymour Hoffman', born:1967})\n", - "CREATE (JanB:Person {name:'Jan de Bont', born:1943})\n", - "CREATE\n", - "(BillPax)-[:ACTED_IN {roles:['Bill Harding']}]->(Twister),\n", - "(HelenH)-[:ACTED_IN {roles:['Dr. Jo Harding']}]->(Twister),\n", - "(ZachG)-[:ACTED_IN {roles:['Eddie']}]->(Twister),\n", - "(PhilipH)-[:ACTED_IN {roles:['Dustin \"Dusty\" Davis']}]->(Twister),\n", - "(JanB)-[:DIRECTED]->(Twister)\n", - "\n", - "CREATE (CastAway:Movie {title:'Cast Away', released:2000, tagline:'At the edge of the world, his journey begins.'})\n", - "CREATE (RobertZ:Person {name:'Robert Zemeckis', born:1951})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Chuck Noland']}]->(CastAway),\n", - "(HelenH)-[:ACTED_IN {roles:['Kelly Frears']}]->(CastAway),\n", - "(RobertZ)-[:DIRECTED]->(CastAway)\n", - "\n", - "CREATE (OneFlewOvertheCuckoosNest:Movie {title:\"One Flew Over the Cuckoo's Nest\", released:1975, tagline:\"If he's crazy, what does that make you?\"})\n", - "CREATE (MilosF:Person {name:'Milos Forman', born:1932})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Randle McMurphy']}]->(OneFlewOvertheCuckoosNest),\n", - "(DannyD)-[:ACTED_IN {roles:['Martini']}]->(OneFlewOvertheCuckoosNest),\n", - "(MilosF)-[:DIRECTED]->(OneFlewOvertheCuckoosNest)\n", - "\n", - "CREATE (SomethingsGottaGive:Movie {title:\"Something's Gotta Give\", released:2003})\n", - "CREATE (DianeK:Person {name:'Diane Keaton', born:1946})\n", - "CREATE (NancyM:Person {name:'Nancy Meyers', born:1949})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Harry Sanborn']}]->(SomethingsGottaGive),\n", - "(DianeK)-[:ACTED_IN {roles:['Erica Barry']}]->(SomethingsGottaGive),\n", - "(Keanu)-[:ACTED_IN {roles:['Julian Mercer']}]->(SomethingsGottaGive),\n", - "(NancyM)-[:DIRECTED]->(SomethingsGottaGive),\n", - "(NancyM)-[:PRODUCED]->(SomethingsGottaGive),\n", - "(NancyM)-[:WROTE]->(SomethingsGottaGive)\n", - "\n", - "CREATE (BicentennialMan:Movie {title:'Bicentennial Man', released:1999, tagline:\"One robot's 200 year journey to become an ordinary man.\"})\n", - "CREATE (ChrisC:Person {name:'Chris Columbus', born:1958})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Andrew Marin']}]->(BicentennialMan),\n", - "(OliverP)-[:ACTED_IN {roles:['Rupert Burns']}]->(BicentennialMan),\n", - "(ChrisC)-[:DIRECTED]->(BicentennialMan)\n", - "\n", - "CREATE (CharlieWilsonsWar:Movie {title:\"Charlie Wilson's War\", released:2007, tagline:\"A stiff drink. A little mascara. 
A lot of nerve. Who said they couldn't bring down the Soviet empire.\"})\n", - "CREATE (JuliaR:Person {name:'Julia Roberts', born:1967})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Rep. Charlie Wilson']}]->(CharlieWilsonsWar),\n", - "(JuliaR)-[:ACTED_IN {roles:['Joanne Herring']}]->(CharlieWilsonsWar),\n", - "(PhilipH)-[:ACTED_IN {roles:['Gust Avrakotos']}]->(CharlieWilsonsWar),\n", - "(MikeN)-[:DIRECTED]->(CharlieWilsonsWar)\n", - "\n", - "CREATE (ThePolarExpress:Movie {title:'The Polar Express', released:2004, tagline:'This Holiday Season... Believe'})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Hero Boy', 'Father', 'Conductor', 'Hobo', 'Scrooge', 'Santa Claus']}]->(ThePolarExpress),\n", - "(RobertZ)-[:DIRECTED]->(ThePolarExpress)\n", - "\n", - "CREATE (ALeagueofTheirOwn:Movie {title:'A League of Their Own', released:1992, tagline:'Once in a lifetime you get a chance to do something different.'})\n", - "CREATE (Madonna:Person {name:'Madonna', born:1954})\n", - "CREATE (GeenaD:Person {name:'Geena Davis', born:1956})\n", - "CREATE (LoriP:Person {name:'Lori Petty', born:1963})\n", - "CREATE (PennyM:Person {name:'Penny Marshall', born:1943})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Jimmy Dugan']}]->(ALeagueofTheirOwn),\n", - "(GeenaD)-[:ACTED_IN {roles:['Dottie Hinson']}]->(ALeagueofTheirOwn),\n", - "(LoriP)-[:ACTED_IN {roles:['Kit Keller']}]->(ALeagueofTheirOwn),\n", - "(RosieO)-[:ACTED_IN {roles:['Doris Murphy']}]->(ALeagueofTheirOwn),\n", - "(Madonna)-[:ACTED_IN {roles:['\"All the Way\" Mae Mordabito']}]->(ALeagueofTheirOwn),\n", - "(BillPax)-[:ACTED_IN {roles:['Bob Hinson']}]->(ALeagueofTheirOwn),\n", - "(PennyM)-[:DIRECTED]->(ALeagueofTheirOwn)\n", - "\n", - "CREATE (PaulBlythe:Person {name:'Paul Blythe'})\n", - "CREATE (AngelaScope:Person {name:'Angela Scope'})\n", - "CREATE (JessicaThompson:Person {name:'Jessica Thompson'})\n", - "CREATE (JamesThompson:Person {name:'James Thompson'})\n", - "\n", - "CREATE\n", - "(JamesThompson)-[:FOLLOWS]->(JessicaThompson),\n", - "(AngelaScope)-[:FOLLOWS]->(JessicaThompson),\n", - "(PaulBlythe)-[:FOLLOWS]->(AngelaScope)\n", - "\n", - "CREATE\n", - "(JessicaThompson)-[:REVIEWED {summary:'An amazing journey', rating:95}]->(CloudAtlas),\n", - "(JessicaThompson)-[:REVIEWED {summary:'Silly, but fun', rating:65}]->(TheReplacements),\n", - "(JamesThompson)-[:REVIEWED {summary:'The coolest football movie ever', rating:100}]->(TheReplacements),\n", - "(AngelaScope)-[:REVIEWED {summary:'Pretty funny at times', rating:62}]->(TheReplacements),\n", - "(JessicaThompson)-[:REVIEWED {summary:'Dark, but compelling', rating:85}]->(Unforgiven),\n", - "(JessicaThompson)-[:REVIEWED {summary:\"Slapstick redeemed only by the Robin Williams and Gene Hackman's stellar performances\", rating:45}]->(TheBirdcage),\n", - "(JessicaThompson)-[:REVIEWED {summary:'A solid romp', rating:68}]->(TheDaVinciCode),\n", - "(JamesThompson)-[:REVIEWED {summary:'Fun, but a little far fetched', rating:65}]->(TheDaVinciCode),\n", - "(JessicaThompson)-[:REVIEWED {summary:'You had me at Jerry', rating:92}]->(JerryMaguire)\n", - "\n", - "WITH TomH as a\n", - "MATCH (a)-[:ACTED_IN]->(m)<-[:DIRECTED]-(d) RETURN a,m,d LIMIT 10;\n", - "```\n", - "\n", - "
\n", - "\n", - "This will create the following graph model\n", - "\n", - "" - ], - "metadata": { - "id": "AQhqv93Mj0Ss" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher -u $neo4j_url -us $neo4j_user -pw $neo4j_password\n", - "// the following Cypher query is the same as above\n", - "// and is required for running the notebook\n", - "CREATE (TheMatrix:Movie {title:'The Matrix', released:1999, tagline:'Welcome to the Real World'})\n", - "CREATE (Keanu:Person {name:'Keanu Reeves', born:1964})\n", - "CREATE (Carrie:Person {name:'Carrie-Anne Moss', born:1967})\n", - "CREATE (Laurence:Person {name:'Laurence Fishburne', born:1961})\n", - "CREATE (Hugo:Person {name:'Hugo Weaving', born:1960})\n", - "CREATE (LillyW:Person {name:'Lilly Wachowski', born:1967})\n", - "CREATE (LanaW:Person {name:'Lana Wachowski', born:1965})\n", - "CREATE (JoelS:Person {name:'Joel Silver', born:1952})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrix),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrix),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrix),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrix),\n", - "(LillyW)-[:DIRECTED]->(TheMatrix),\n", - "(LanaW)-[:DIRECTED]->(TheMatrix),\n", - "(JoelS)-[:PRODUCED]->(TheMatrix)\n", - "\n", - "CREATE (Emil:Person {name:\"Emil Eifrem\", born:1978})\n", - "CREATE (Emil)-[:ACTED_IN {roles:[\"Emil\"]}]->(TheMatrix)\n", - "\n", - "CREATE (TheMatrixReloaded:Movie {title:'The Matrix Reloaded', released:2003, tagline:'Free your mind'})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixReloaded),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixReloaded),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixReloaded),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixReloaded),\n", - "(LillyW)-[:DIRECTED]->(TheMatrixReloaded),\n", - "(LanaW)-[:DIRECTED]->(TheMatrixReloaded),\n", - "(JoelS)-[:PRODUCED]->(TheMatrixReloaded)\n", - "\n", - "CREATE (TheMatrixRevolutions:Movie {title:'The Matrix Revolutions', released:2003, tagline:'Everything that has a beginning has an end'})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixRevolutions),\n", - "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixRevolutions),\n", - "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixRevolutions),\n", - "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixRevolutions),\n", - "(LillyW)-[:DIRECTED]->(TheMatrixRevolutions),\n", - "(LanaW)-[:DIRECTED]->(TheMatrixRevolutions),\n", - "(JoelS)-[:PRODUCED]->(TheMatrixRevolutions)\n", - "\n", - "CREATE (TheDevilsAdvocate:Movie {title:\"The Devil's Advocate\", released:1997, tagline:'Evil has its winning ways'})\n", - "CREATE (Charlize:Person {name:'Charlize Theron', born:1975})\n", - "CREATE (Al:Person {name:'Al Pacino', born:1940})\n", - "CREATE (Taylor:Person {name:'Taylor Hackford', born:1944})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Kevin Lomax']}]->(TheDevilsAdvocate),\n", - "(Charlize)-[:ACTED_IN {roles:['Mary Ann Lomax']}]->(TheDevilsAdvocate),\n", - "(Al)-[:ACTED_IN {roles:['John Milton']}]->(TheDevilsAdvocate),\n", - "(Taylor)-[:DIRECTED]->(TheDevilsAdvocate)\n", - "\n", - "CREATE (AFewGoodMen:Movie {title:\"A Few Good Men\", released:1992, tagline:\"In the heart of the nation's capital, in a courthouse of the U.S. 
government, one man will stop at nothing to keep his honor, and one will stop at nothing to find the truth.\"})\n", - "CREATE (TomC:Person {name:'Tom Cruise', born:1962})\n", - "CREATE (JackN:Person {name:'Jack Nicholson', born:1937})\n", - "CREATE (DemiM:Person {name:'Demi Moore', born:1962})\n", - "CREATE (KevinB:Person {name:'Kevin Bacon', born:1958})\n", - "CREATE (KieferS:Person {name:'Kiefer Sutherland', born:1966})\n", - "CREATE (NoahW:Person {name:'Noah Wyle', born:1971})\n", - "CREATE (CubaG:Person {name:'Cuba Gooding Jr.', born:1968})\n", - "CREATE (KevinP:Person {name:'Kevin Pollak', born:1957})\n", - "CREATE (JTW:Person {name:'J.T. Walsh', born:1943})\n", - "CREATE (JamesM:Person {name:'James Marshall', born:1967})\n", - "CREATE (ChristopherG:Person {name:'Christopher Guest', born:1948})\n", - "CREATE (RobR:Person {name:'Rob Reiner', born:1947})\n", - "CREATE (AaronS:Person {name:'Aaron Sorkin', born:1961})\n", - "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Lt. Daniel Kaffee']}]->(AFewGoodMen),\n", - "(JackN)-[:ACTED_IN {roles:['Col. Nathan R. Jessup']}]->(AFewGoodMen),\n", - "(DemiM)-[:ACTED_IN {roles:['Lt. Cdr. JoAnne Galloway']}]->(AFewGoodMen),\n", - "(KevinB)-[:ACTED_IN {roles:['Capt. Jack Ross']}]->(AFewGoodMen),\n", - "(KieferS)-[:ACTED_IN {roles:['Lt. Jonathan Kendrick']}]->(AFewGoodMen),\n", - "(NoahW)-[:ACTED_IN {roles:['Cpl. Jeffrey Barnes']}]->(AFewGoodMen),\n", - "(CubaG)-[:ACTED_IN {roles:['Cpl. Carl Hammaker']}]->(AFewGoodMen),\n", - "(KevinP)-[:ACTED_IN {roles:['Lt. Sam Weinberg']}]->(AFewGoodMen),\n", - "(JTW)-[:ACTED_IN {roles:['Lt. Col. Matthew Andrew Markinson']}]->(AFewGoodMen),\n", - "(JamesM)-[:ACTED_IN {roles:['Pfc. Louden Downey']}]->(AFewGoodMen),\n", - "(ChristopherG)-[:ACTED_IN {roles:['Dr. Stone']}]->(AFewGoodMen),\n", - "(AaronS)-[:ACTED_IN {roles:['Man in Bar']}]->(AFewGoodMen),\n", - "(RobR)-[:DIRECTED]->(AFewGoodMen),\n", - "(AaronS)-[:WROTE]->(AFewGoodMen)\n", - "\n", - "CREATE (TopGun:Movie {title:\"Top Gun\", released:1986, tagline:'I feel the need, the need for speed.'})\n", - "CREATE (KellyM:Person {name:'Kelly McGillis', born:1957})\n", - "CREATE (ValK:Person {name:'Val Kilmer', born:1959})\n", - "CREATE (AnthonyE:Person {name:'Anthony Edwards', born:1962})\n", - "CREATE (TomS:Person {name:'Tom Skerritt', born:1933})\n", - "CREATE (MegR:Person {name:'Meg Ryan', born:1961})\n", - "CREATE (TonyS:Person {name:'Tony Scott', born:1944})\n", - "CREATE (JimC:Person {name:'Jim Cash', born:1941})\n", - "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Maverick']}]->(TopGun),\n", - "(KellyM)-[:ACTED_IN {roles:['Charlie']}]->(TopGun),\n", - "(ValK)-[:ACTED_IN {roles:['Iceman']}]->(TopGun),\n", - "(AnthonyE)-[:ACTED_IN {roles:['Goose']}]->(TopGun),\n", - "(TomS)-[:ACTED_IN {roles:['Viper']}]->(TopGun),\n", - "(MegR)-[:ACTED_IN {roles:['Carole']}]->(TopGun),\n", - "(TonyS)-[:DIRECTED]->(TopGun),\n", - "(JimC)-[:WROTE]->(TopGun)\n", - "\n", - "CREATE (JerryMaguire:Movie {title:'Jerry Maguire', released:2000, tagline:'The rest of his life begins now.'})\n", - "CREATE (ReneeZ:Person {name:'Renee Zellweger', born:1969})\n", - "CREATE (KellyP:Person {name:'Kelly Preston', born:1962})\n", - "CREATE (JerryO:Person {name:\"Jerry O'Connell\", born:1974})\n", - "CREATE (JayM:Person {name:'Jay Mohr', born:1970})\n", - "CREATE (BonnieH:Person {name:'Bonnie Hunt', born:1961})\n", - "CREATE (ReginaK:Person {name:'Regina King', born:1971})\n", - "CREATE (JonathanL:Person {name:'Jonathan Lipnicki', born:1996})\n", - "CREATE (CameronC:Person {name:'Cameron Crowe', born:1957})\n", 
- "CREATE\n", - "(TomC)-[:ACTED_IN {roles:['Jerry Maguire']}]->(JerryMaguire),\n", - "(CubaG)-[:ACTED_IN {roles:['Rod Tidwell']}]->(JerryMaguire),\n", - "(ReneeZ)-[:ACTED_IN {roles:['Dorothy Boyd']}]->(JerryMaguire),\n", - "(KellyP)-[:ACTED_IN {roles:['Avery Bishop']}]->(JerryMaguire),\n", - "(JerryO)-[:ACTED_IN {roles:['Frank Cushman']}]->(JerryMaguire),\n", - "(JayM)-[:ACTED_IN {roles:['Bob Sugar']}]->(JerryMaguire),\n", - "(BonnieH)-[:ACTED_IN {roles:['Laurel Boyd']}]->(JerryMaguire),\n", - "(ReginaK)-[:ACTED_IN {roles:['Marcee Tidwell']}]->(JerryMaguire),\n", - "(JonathanL)-[:ACTED_IN {roles:['Ray Boyd']}]->(JerryMaguire),\n", - "(CameronC)-[:DIRECTED]->(JerryMaguire),\n", - "(CameronC)-[:PRODUCED]->(JerryMaguire),\n", - "(CameronC)-[:WROTE]->(JerryMaguire)\n", - "\n", - "CREATE (StandByMe:Movie {title:\"Stand By Me\", released:1986, tagline:\"For some, it's the last real taste of innocence, and the first real taste of life. But for everyone, it's the time that memories are made of.\"})\n", - "CREATE (RiverP:Person {name:'River Phoenix', born:1970})\n", - "CREATE (CoreyF:Person {name:'Corey Feldman', born:1971})\n", - "CREATE (WilW:Person {name:'Wil Wheaton', born:1972})\n", - "CREATE (JohnC:Person {name:'John Cusack', born:1966})\n", - "CREATE (MarshallB:Person {name:'Marshall Bell', born:1942})\n", - "CREATE\n", - "(WilW)-[:ACTED_IN {roles:['Gordie Lachance']}]->(StandByMe),\n", - "(RiverP)-[:ACTED_IN {roles:['Chris Chambers']}]->(StandByMe),\n", - "(JerryO)-[:ACTED_IN {roles:['Vern Tessio']}]->(StandByMe),\n", - "(CoreyF)-[:ACTED_IN {roles:['Teddy Duchamp']}]->(StandByMe),\n", - "(JohnC)-[:ACTED_IN {roles:['Denny Lachance']}]->(StandByMe),\n", - "(KieferS)-[:ACTED_IN {roles:['Ace Merrill']}]->(StandByMe),\n", - "(MarshallB)-[:ACTED_IN {roles:['Mr. Lachance']}]->(StandByMe),\n", - "(RobR)-[:DIRECTED]->(StandByMe)\n", - "\n", - "CREATE (AsGoodAsItGets:Movie {title:'As Good as It Gets', released:1997, tagline:'A comedy from the heart that goes for the throat.'})\n", - "CREATE (HelenH:Person {name:'Helen Hunt', born:1963})\n", - "CREATE (GregK:Person {name:'Greg Kinnear', born:1963})\n", - "CREATE (JamesB:Person {name:'James L. Brooks', born:1940})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Melvin Udall']}]->(AsGoodAsItGets),\n", - "(HelenH)-[:ACTED_IN {roles:['Carol Connelly']}]->(AsGoodAsItGets),\n", - "(GregK)-[:ACTED_IN {roles:['Simon Bishop']}]->(AsGoodAsItGets),\n", - "(CubaG)-[:ACTED_IN {roles:['Frank Sachs']}]->(AsGoodAsItGets),\n", - "(JamesB)-[:DIRECTED]->(AsGoodAsItGets)\n", - "\n", - "CREATE (WhatDreamsMayCome:Movie {title:'What Dreams May Come', released:1998, tagline:'After life there is more. 
The end is just the beginning.'})\n", - "CREATE (AnnabellaS:Person {name:'Annabella Sciorra', born:1960})\n", - "CREATE (MaxS:Person {name:'Max von Sydow', born:1929})\n", - "CREATE (WernerH:Person {name:'Werner Herzog', born:1942})\n", - "CREATE (Robin:Person {name:'Robin Williams', born:1951})\n", - "CREATE (VincentW:Person {name:'Vincent Ward', born:1956})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Chris Nielsen']}]->(WhatDreamsMayCome),\n", - "(CubaG)-[:ACTED_IN {roles:['Albert Lewis']}]->(WhatDreamsMayCome),\n", - "(AnnabellaS)-[:ACTED_IN {roles:['Annie Collins-Nielsen']}]->(WhatDreamsMayCome),\n", - "(MaxS)-[:ACTED_IN {roles:['The Tracker']}]->(WhatDreamsMayCome),\n", - "(WernerH)-[:ACTED_IN {roles:['The Face']}]->(WhatDreamsMayCome),\n", - "(VincentW)-[:DIRECTED]->(WhatDreamsMayCome)\n", - "\n", - "CREATE (SnowFallingonCedars:Movie {title:'Snow Falling on Cedars', released:1999, tagline:'First loves last. Forever.'})\n", - "CREATE (EthanH:Person {name:'Ethan Hawke', born:1970})\n", - "CREATE (RickY:Person {name:'Rick Yune', born:1971})\n", - "CREATE (JamesC:Person {name:'James Cromwell', born:1940})\n", - "CREATE (ScottH:Person {name:'Scott Hicks', born:1953})\n", - "CREATE\n", - "(EthanH)-[:ACTED_IN {roles:['Ishmael Chambers']}]->(SnowFallingonCedars),\n", - "(RickY)-[:ACTED_IN {roles:['Kazuo Miyamoto']}]->(SnowFallingonCedars),\n", - "(MaxS)-[:ACTED_IN {roles:['Nels Gudmundsson']}]->(SnowFallingonCedars),\n", - "(JamesC)-[:ACTED_IN {roles:['Judge Fielding']}]->(SnowFallingonCedars),\n", - "(ScottH)-[:DIRECTED]->(SnowFallingonCedars)\n", - "\n", - "CREATE (YouveGotMail:Movie {title:\"You've Got Mail\", released:1998, tagline:'At odds in life... in love on-line.'})\n", - "CREATE (ParkerP:Person {name:'Parker Posey', born:1968})\n", - "CREATE (DaveC:Person {name:'Dave Chappelle', born:1973})\n", - "CREATE (SteveZ:Person {name:'Steve Zahn', born:1967})\n", - "CREATE (TomH:Person {name:'Tom Hanks', born:1956})\n", - "CREATE (NoraE:Person {name:'Nora Ephron', born:1941})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Joe Fox']}]->(YouveGotMail),\n", - "(MegR)-[:ACTED_IN {roles:['Kathleen Kelly']}]->(YouveGotMail),\n", - "(GregK)-[:ACTED_IN {roles:['Frank Navasky']}]->(YouveGotMail),\n", - "(ParkerP)-[:ACTED_IN {roles:['Patricia Eden']}]->(YouveGotMail),\n", - "(DaveC)-[:ACTED_IN {roles:['Kevin Jackson']}]->(YouveGotMail),\n", - "(SteveZ)-[:ACTED_IN {roles:['George Pappas']}]->(YouveGotMail),\n", - "(NoraE)-[:DIRECTED]->(YouveGotMail)\n", - "\n", - "CREATE (SleeplessInSeattle:Movie {title:'Sleepless in Seattle', released:1993, tagline:'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?'})\n", - "CREATE (RitaW:Person {name:'Rita Wilson', born:1956})\n", - "CREATE (BillPull:Person {name:'Bill Pullman', born:1953})\n", - "CREATE (VictorG:Person {name:'Victor Garber', born:1949})\n", - "CREATE (RosieO:Person {name:\"Rosie O'Donnell\", born:1962})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Sam Baldwin']}]->(SleeplessInSeattle),\n", - "(MegR)-[:ACTED_IN {roles:['Annie Reed']}]->(SleeplessInSeattle),\n", - "(RitaW)-[:ACTED_IN {roles:['Suzy']}]->(SleeplessInSeattle),\n", - "(BillPull)-[:ACTED_IN {roles:['Walter']}]->(SleeplessInSeattle),\n", - "(VictorG)-[:ACTED_IN {roles:['Greg']}]->(SleeplessInSeattle),\n", - "(RosieO)-[:ACTED_IN {roles:['Becky']}]->(SleeplessInSeattle),\n", - "(NoraE)-[:DIRECTED]->(SleeplessInSeattle)\n", - "\n", - "CREATE (JoeVersustheVolcano:Movie {title:'Joe Versus the Volcano', released:1990, tagline:'A 
story of love, lava and burning desire.'})\n", - "CREATE (JohnS:Person {name:'John Patrick Stanley', born:1950})\n", - "CREATE (Nathan:Person {name:'Nathan Lane', born:1956})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Joe Banks']}]->(JoeVersustheVolcano),\n", - "(MegR)-[:ACTED_IN {roles:['DeDe', 'Angelica Graynamore', 'Patricia Graynamore']}]->(JoeVersustheVolcano),\n", - "(Nathan)-[:ACTED_IN {roles:['Baw']}]->(JoeVersustheVolcano),\n", - "(JohnS)-[:DIRECTED]->(JoeVersustheVolcano)\n", - "\n", - "CREATE (WhenHarryMetSally:Movie {title:'When Harry Met Sally', released:1998, tagline:'Can two friends sleep together and still love each other in the morning?'})\n", - "CREATE (BillyC:Person {name:'Billy Crystal', born:1948})\n", - "CREATE (CarrieF:Person {name:'Carrie Fisher', born:1956})\n", - "CREATE (BrunoK:Person {name:'Bruno Kirby', born:1949})\n", - "CREATE\n", - "(BillyC)-[:ACTED_IN {roles:['Harry Burns']}]->(WhenHarryMetSally),\n", - "(MegR)-[:ACTED_IN {roles:['Sally Albright']}]->(WhenHarryMetSally),\n", - "(CarrieF)-[:ACTED_IN {roles:['Marie']}]->(WhenHarryMetSally),\n", - "(BrunoK)-[:ACTED_IN {roles:['Jess']}]->(WhenHarryMetSally),\n", - "(RobR)-[:DIRECTED]->(WhenHarryMetSally),\n", - "(RobR)-[:PRODUCED]->(WhenHarryMetSally),\n", - "(NoraE)-[:PRODUCED]->(WhenHarryMetSally),\n", - "(NoraE)-[:WROTE]->(WhenHarryMetSally)\n", - "\n", - "CREATE (ThatThingYouDo:Movie {title:'That Thing You Do', released:1996, tagline:'In every life there comes a time when that thing you dream becomes that thing you do'})\n", - "CREATE (LivT:Person {name:'Liv Tyler', born:1977})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Mr. White']}]->(ThatThingYouDo),\n", - "(LivT)-[:ACTED_IN {roles:['Faye Dolan']}]->(ThatThingYouDo),\n", - "(Charlize)-[:ACTED_IN {roles:['Tina']}]->(ThatThingYouDo),\n", - "(TomH)-[:DIRECTED]->(ThatThingYouDo)\n", - "\n", - "CREATE (TheReplacements:Movie {title:'The Replacements', released:2000, tagline:'Pain heals, Chicks dig scars... 
Glory lasts forever'})\n", - "CREATE (Brooke:Person {name:'Brooke Langton', born:1970})\n", - "CREATE (Gene:Person {name:'Gene Hackman', born:1930})\n", - "CREATE (Orlando:Person {name:'Orlando Jones', born:1968})\n", - "CREATE (Howard:Person {name:'Howard Deutch', born:1950})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Shane Falco']}]->(TheReplacements),\n", - "(Brooke)-[:ACTED_IN {roles:['Annabelle Farrell']}]->(TheReplacements),\n", - "(Gene)-[:ACTED_IN {roles:['Jimmy McGinty']}]->(TheReplacements),\n", - "(Orlando)-[:ACTED_IN {roles:['Clifford Franklin']}]->(TheReplacements),\n", - "(Howard)-[:DIRECTED]->(TheReplacements)\n", - "\n", - "CREATE (RescueDawn:Movie {title:'RescueDawn', released:2006, tagline:\"Based on the extraordinary true story of one man's fight for freedom\"})\n", - "CREATE (ChristianB:Person {name:'Christian Bale', born:1974})\n", - "CREATE (ZachG:Person {name:'Zach Grenier', born:1954})\n", - "CREATE\n", - "(MarshallB)-[:ACTED_IN {roles:['Admiral']}]->(RescueDawn),\n", - "(ChristianB)-[:ACTED_IN {roles:['Dieter Dengler']}]->(RescueDawn),\n", - "(ZachG)-[:ACTED_IN {roles:['Squad Leader']}]->(RescueDawn),\n", - "(SteveZ)-[:ACTED_IN {roles:['Duane']}]->(RescueDawn),\n", - "(WernerH)-[:DIRECTED]->(RescueDawn)\n", - "\n", - "CREATE (TheBirdcage:Movie {title:'The Birdcage', released:1996, tagline:'Come as you are'})\n", - "CREATE (MikeN:Person {name:'Mike Nichols', born:1931})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Armand Goldman']}]->(TheBirdcage),\n", - "(Nathan)-[:ACTED_IN {roles:['Albert Goldman']}]->(TheBirdcage),\n", - "(Gene)-[:ACTED_IN {roles:['Sen. Kevin Keeley']}]->(TheBirdcage),\n", - "(MikeN)-[:DIRECTED]->(TheBirdcage)\n", - "\n", - "CREATE (Unforgiven:Movie {title:'Unforgiven', released:1992, tagline:\"It's a hell of a thing, killing a man\"})\n", - "CREATE (RichardH:Person {name:'Richard Harris', born:1930})\n", - "CREATE (ClintE:Person {name:'Clint Eastwood', born:1930})\n", - "CREATE\n", - "(RichardH)-[:ACTED_IN {roles:['English Bob']}]->(Unforgiven),\n", - "(ClintE)-[:ACTED_IN {roles:['Bill Munny']}]->(Unforgiven),\n", - "(Gene)-[:ACTED_IN {roles:['Little Bill Daggett']}]->(Unforgiven),\n", - "(ClintE)-[:DIRECTED]->(Unforgiven)\n", - "\n", - "CREATE (JohnnyMnemonic:Movie {title:'Johnny Mnemonic', released:1995, tagline:'The hottest data on earth. In the coolest head in town'})\n", - "CREATE (Takeshi:Person {name:'Takeshi Kitano', born:1947})\n", - "CREATE (Dina:Person {name:'Dina Meyer', born:1968})\n", - "CREATE (IceT:Person {name:'Ice-T', born:1958})\n", - "CREATE (RobertL:Person {name:'Robert Longo', born:1953})\n", - "CREATE\n", - "(Keanu)-[:ACTED_IN {roles:['Johnny Mnemonic']}]->(JohnnyMnemonic),\n", - "(Takeshi)-[:ACTED_IN {roles:['Takahashi']}]->(JohnnyMnemonic),\n", - "(Dina)-[:ACTED_IN {roles:['Jane']}]->(JohnnyMnemonic),\n", - "(IceT)-[:ACTED_IN {roles:['J-Bone']}]->(JohnnyMnemonic),\n", - "(RobertL)-[:DIRECTED]->(JohnnyMnemonic)\n", - "\n", - "CREATE (CloudAtlas:Movie {title:'Cloud Atlas', released:2012, tagline:'Everything is connected'})\n", - "CREATE (HalleB:Person {name:'Halle Berry', born:1966})\n", - "CREATE (JimB:Person {name:'Jim Broadbent', born:1949})\n", - "CREATE (TomT:Person {name:'Tom Tykwer', born:1965})\n", - "CREATE (DavidMitchell:Person {name:'David Mitchell', born:1969})\n", - "CREATE (StefanArndt:Person {name:'Stefan Arndt', born:1961})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Zachry', 'Dr. 
Henry Goose', 'Isaac Sachs', 'Dermot Hoggins']}]->(CloudAtlas),\n", - "(Hugo)-[:ACTED_IN {roles:['Bill Smoke', 'Haskell Moore', 'Tadeusz Kesselring', 'Nurse Noakes', 'Boardman Mephi', 'Old Georgie']}]->(CloudAtlas),\n", - "(HalleB)-[:ACTED_IN {roles:['Luisa Rey', 'Jocasta Ayrs', 'Ovid', 'Meronym']}]->(CloudAtlas),\n", - "(JimB)-[:ACTED_IN {roles:['Vyvyan Ayrs', 'Captain Molyneux', 'Timothy Cavendish']}]->(CloudAtlas),\n", - "(TomT)-[:DIRECTED]->(CloudAtlas),\n", - "(LillyW)-[:DIRECTED]->(CloudAtlas),\n", - "(LanaW)-[:DIRECTED]->(CloudAtlas),\n", - "(DavidMitchell)-[:WROTE]->(CloudAtlas),\n", - "(StefanArndt)-[:PRODUCED]->(CloudAtlas)\n", - "\n", - "CREATE (TheDaVinciCode:Movie {title:'The Da Vinci Code', released:2006, tagline:'Break The Codes'})\n", - "CREATE (IanM:Person {name:'Ian McKellen', born:1939})\n", - "CREATE (AudreyT:Person {name:'Audrey Tautou', born:1976})\n", - "CREATE (PaulB:Person {name:'Paul Bettany', born:1971})\n", - "CREATE (RonH:Person {name:'Ron Howard', born:1954})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Dr. Robert Langdon']}]->(TheDaVinciCode),\n", - "(IanM)-[:ACTED_IN {roles:['Sir Leight Teabing']}]->(TheDaVinciCode),\n", - "(AudreyT)-[:ACTED_IN {roles:['Sophie Neveu']}]->(TheDaVinciCode),\n", - "(PaulB)-[:ACTED_IN {roles:['Silas']}]->(TheDaVinciCode),\n", - "(RonH)-[:DIRECTED]->(TheDaVinciCode)\n", - "\n", - "CREATE (VforVendetta:Movie {title:'V for Vendetta', released:2006, tagline:'Freedom! Forever!'})\n", - "CREATE (NatalieP:Person {name:'Natalie Portman', born:1981})\n", - "CREATE (StephenR:Person {name:'Stephen Rea', born:1946})\n", - "CREATE (JohnH:Person {name:'John Hurt', born:1940})\n", - "CREATE (BenM:Person {name: 'Ben Miles', born:1967})\n", - "CREATE\n", - "(Hugo)-[:ACTED_IN {roles:['V']}]->(VforVendetta),\n", - "(NatalieP)-[:ACTED_IN {roles:['Evey Hammond']}]->(VforVendetta),\n", - "(StephenR)-[:ACTED_IN {roles:['Eric Finch']}]->(VforVendetta),\n", - "(JohnH)-[:ACTED_IN {roles:['High Chancellor Adam Sutler']}]->(VforVendetta),\n", - "(BenM)-[:ACTED_IN {roles:['Dascomb']}]->(VforVendetta),\n", - "(JamesM)-[:DIRECTED]->(VforVendetta),\n", - "(LillyW)-[:PRODUCED]->(VforVendetta),\n", - "(LanaW)-[:PRODUCED]->(VforVendetta),\n", - "(JoelS)-[:PRODUCED]->(VforVendetta),\n", - "(LillyW)-[:WROTE]->(VforVendetta),\n", - "(LanaW)-[:WROTE]->(VforVendetta)\n", - "\n", - "CREATE (SpeedRacer:Movie {title:'Speed Racer', released:2008, tagline:'Speed has no limits'})\n", - "CREATE (EmileH:Person {name:'Emile Hirsch', born:1985})\n", - "CREATE (JohnG:Person {name:'John Goodman', born:1960})\n", - "CREATE (SusanS:Person {name:'Susan Sarandon', born:1946})\n", - "CREATE (MatthewF:Person {name:'Matthew Fox', born:1966})\n", - "CREATE (ChristinaR:Person {name:'Christina Ricci', born:1980})\n", - "CREATE (Rain:Person {name:'Rain', born:1982})\n", - "CREATE\n", - "(EmileH)-[:ACTED_IN {roles:['Speed Racer']}]->(SpeedRacer),\n", - "(JohnG)-[:ACTED_IN {roles:['Pops']}]->(SpeedRacer),\n", - "(SusanS)-[:ACTED_IN {roles:['Mom']}]->(SpeedRacer),\n", - "(MatthewF)-[:ACTED_IN {roles:['Racer X']}]->(SpeedRacer),\n", - "(ChristinaR)-[:ACTED_IN {roles:['Trixie']}]->(SpeedRacer),\n", - "(Rain)-[:ACTED_IN {roles:['Taejo Togokahn']}]->(SpeedRacer),\n", - "(BenM)-[:ACTED_IN {roles:['Cass Jones']}]->(SpeedRacer),\n", - "(LillyW)-[:DIRECTED]->(SpeedRacer),\n", - "(LanaW)-[:DIRECTED]->(SpeedRacer),\n", - "(LillyW)-[:WROTE]->(SpeedRacer),\n", - "(LanaW)-[:WROTE]->(SpeedRacer),\n", - "(JoelS)-[:PRODUCED]->(SpeedRacer)\n", - "\n", - "CREATE (NinjaAssassin:Movie {title:'Ninja Assassin', 
released:2009, tagline:'Prepare to enter a secret world of assassins'})\n", - "CREATE (NaomieH:Person {name:'Naomie Harris'})\n", - "CREATE\n", - "(Rain)-[:ACTED_IN {roles:['Raizo']}]->(NinjaAssassin),\n", - "(NaomieH)-[:ACTED_IN {roles:['Mika Coretti']}]->(NinjaAssassin),\n", - "(RickY)-[:ACTED_IN {roles:['Takeshi']}]->(NinjaAssassin),\n", - "(BenM)-[:ACTED_IN {roles:['Ryan Maslow']}]->(NinjaAssassin),\n", - "(JamesM)-[:DIRECTED]->(NinjaAssassin),\n", - "(LillyW)-[:PRODUCED]->(NinjaAssassin),\n", - "(LanaW)-[:PRODUCED]->(NinjaAssassin),\n", - "(JoelS)-[:PRODUCED]->(NinjaAssassin)\n", - "\n", - "CREATE (TheGreenMile:Movie {title:'The Green Mile', released:1999, tagline:\"Walk a mile you'll never forget.\"})\n", - "CREATE (MichaelD:Person {name:'Michael Clarke Duncan', born:1957})\n", - "CREATE (DavidM:Person {name:'David Morse', born:1953})\n", - "CREATE (SamR:Person {name:'Sam Rockwell', born:1968})\n", - "CREATE (GaryS:Person {name:'Gary Sinise', born:1955})\n", - "CREATE (PatriciaC:Person {name:'Patricia Clarkson', born:1959})\n", - "CREATE (FrankD:Person {name:'Frank Darabont', born:1959})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Paul Edgecomb']}]->(TheGreenMile),\n", - "(MichaelD)-[:ACTED_IN {roles:['John Coffey']}]->(TheGreenMile),\n", - "(DavidM)-[:ACTED_IN {roles:['Brutus \"Brutal\" Howell']}]->(TheGreenMile),\n", - "(BonnieH)-[:ACTED_IN {roles:['Jan Edgecomb']}]->(TheGreenMile),\n", - "(JamesC)-[:ACTED_IN {roles:['Warden Hal Moores']}]->(TheGreenMile),\n", - "(SamR)-[:ACTED_IN {roles:['\"Wild Bill\" Wharton']}]->(TheGreenMile),\n", - "(GaryS)-[:ACTED_IN {roles:['Burt Hammersmith']}]->(TheGreenMile),\n", - "(PatriciaC)-[:ACTED_IN {roles:['Melinda Moores']}]->(TheGreenMile),\n", - "(FrankD)-[:DIRECTED]->(TheGreenMile)\n", - "\n", - "CREATE (FrostNixon:Movie {title:'Frost/Nixon', released:2008, tagline:'400 million people were waiting for the truth.'})\n", - "CREATE (FrankL:Person {name:'Frank Langella', born:1938})\n", - "CREATE (MichaelS:Person {name:'Michael Sheen', born:1969})\n", - "CREATE (OliverP:Person {name:'Oliver Platt', born:1960})\n", - "CREATE\n", - "(FrankL)-[:ACTED_IN {roles:['Richard Nixon']}]->(FrostNixon),\n", - "(MichaelS)-[:ACTED_IN {roles:['David Frost']}]->(FrostNixon),\n", - "(KevinB)-[:ACTED_IN {roles:['Jack Brennan']}]->(FrostNixon),\n", - "(OliverP)-[:ACTED_IN {roles:['Bob Zelnick']}]->(FrostNixon),\n", - "(SamR)-[:ACTED_IN {roles:['James Reston, Jr.']}]->(FrostNixon),\n", - "(RonH)-[:DIRECTED]->(FrostNixon)\n", - "\n", - "CREATE (Hoffa:Movie {title:'Hoffa', released:1992, tagline:\"He didn't want law. He wanted justice.\"})\n", - "CREATE (DannyD:Person {name:'Danny DeVito', born:1944})\n", - "CREATE (JohnR:Person {name:'John C. 
Reilly', born:1965})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Hoffa']}]->(Hoffa),\n", - "(DannyD)-[:ACTED_IN {roles:['Robert \"Bobby\" Ciaro']}]->(Hoffa),\n", - "(JTW)-[:ACTED_IN {roles:['Frank Fitzsimmons']}]->(Hoffa),\n", - "(JohnR)-[:ACTED_IN {roles:['Peter \"Pete\" Connelly']}]->(Hoffa),\n", - "(DannyD)-[:DIRECTED]->(Hoffa)\n", - "\n", - "CREATE (Apollo13:Movie {title:'Apollo 13', released:1995, tagline:'Houston, we have a problem.'})\n", - "CREATE (EdH:Person {name:'Ed Harris', born:1950})\n", - "CREATE (BillPax:Person {name:'Bill Paxton', born:1955})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Jim Lovell']}]->(Apollo13),\n", - "(KevinB)-[:ACTED_IN {roles:['Jack Swigert']}]->(Apollo13),\n", - "(EdH)-[:ACTED_IN {roles:['Gene Kranz']}]->(Apollo13),\n", - "(BillPax)-[:ACTED_IN {roles:['Fred Haise']}]->(Apollo13),\n", - "(GaryS)-[:ACTED_IN {roles:['Ken Mattingly']}]->(Apollo13),\n", - "(RonH)-[:DIRECTED]->(Apollo13)\n", - "\n", - "CREATE (Twister:Movie {title:'Twister', released:1996, tagline:\"Don't Breathe. Don't Look Back.\"})\n", - "CREATE (PhilipH:Person {name:'Philip Seymour Hoffman', born:1967})\n", - "CREATE (JanB:Person {name:'Jan de Bont', born:1943})\n", - "CREATE\n", - "(BillPax)-[:ACTED_IN {roles:['Bill Harding']}]->(Twister),\n", - "(HelenH)-[:ACTED_IN {roles:['Dr. Jo Harding']}]->(Twister),\n", - "(ZachG)-[:ACTED_IN {roles:['Eddie']}]->(Twister),\n", - "(PhilipH)-[:ACTED_IN {roles:['Dustin \"Dusty\" Davis']}]->(Twister),\n", - "(JanB)-[:DIRECTED]->(Twister)\n", - "\n", - "CREATE (CastAway:Movie {title:'Cast Away', released:2000, tagline:'At the edge of the world, his journey begins.'})\n", - "CREATE (RobertZ:Person {name:'Robert Zemeckis', born:1951})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Chuck Noland']}]->(CastAway),\n", - "(HelenH)-[:ACTED_IN {roles:['Kelly Frears']}]->(CastAway),\n", - "(RobertZ)-[:DIRECTED]->(CastAway)\n", - "\n", - "CREATE (OneFlewOvertheCuckoosNest:Movie {title:\"One Flew Over the Cuckoo's Nest\", released:1975, tagline:\"If he's crazy, what does that make you?\"})\n", - "CREATE (MilosF:Person {name:'Milos Forman', born:1932})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Randle McMurphy']}]->(OneFlewOvertheCuckoosNest),\n", - "(DannyD)-[:ACTED_IN {roles:['Martini']}]->(OneFlewOvertheCuckoosNest),\n", - "(MilosF)-[:DIRECTED]->(OneFlewOvertheCuckoosNest)\n", - "\n", - "CREATE (SomethingsGottaGive:Movie {title:\"Something's Gotta Give\", released:2003})\n", - "CREATE (DianeK:Person {name:'Diane Keaton', born:1946})\n", - "CREATE (NancyM:Person {name:'Nancy Meyers', born:1949})\n", - "CREATE\n", - "(JackN)-[:ACTED_IN {roles:['Harry Sanborn']}]->(SomethingsGottaGive),\n", - "(DianeK)-[:ACTED_IN {roles:['Erica Barry']}]->(SomethingsGottaGive),\n", - "(Keanu)-[:ACTED_IN {roles:['Julian Mercer']}]->(SomethingsGottaGive),\n", - "(NancyM)-[:DIRECTED]->(SomethingsGottaGive),\n", - "(NancyM)-[:PRODUCED]->(SomethingsGottaGive),\n", - "(NancyM)-[:WROTE]->(SomethingsGottaGive)\n", - "\n", - "CREATE (BicentennialMan:Movie {title:'Bicentennial Man', released:1999, tagline:\"One robot's 200 year journey to become an ordinary man.\"})\n", - "CREATE (ChrisC:Person {name:'Chris Columbus', born:1958})\n", - "CREATE\n", - "(Robin)-[:ACTED_IN {roles:['Andrew Marin']}]->(BicentennialMan),\n", - "(OliverP)-[:ACTED_IN {roles:['Rupert Burns']}]->(BicentennialMan),\n", - "(ChrisC)-[:DIRECTED]->(BicentennialMan)\n", - "\n", - "CREATE (CharlieWilsonsWar:Movie {title:\"Charlie Wilson's War\", released:2007, tagline:\"A stiff drink. A little mascara. 
A lot of nerve. Who said they couldn't bring down the Soviet empire.\"})\n", - "CREATE (JuliaR:Person {name:'Julia Roberts', born:1967})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Rep. Charlie Wilson']}]->(CharlieWilsonsWar),\n", - "(JuliaR)-[:ACTED_IN {roles:['Joanne Herring']}]->(CharlieWilsonsWar),\n", - "(PhilipH)-[:ACTED_IN {roles:['Gust Avrakotos']}]->(CharlieWilsonsWar),\n", - "(MikeN)-[:DIRECTED]->(CharlieWilsonsWar)\n", - "\n", - "CREATE (ThePolarExpress:Movie {title:'The Polar Express', released:2004, tagline:'This Holiday Season... Believe'})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Hero Boy', 'Father', 'Conductor', 'Hobo', 'Scrooge', 'Santa Claus']}]->(ThePolarExpress),\n", - "(RobertZ)-[:DIRECTED]->(ThePolarExpress)\n", - "\n", - "CREATE (ALeagueofTheirOwn:Movie {title:'A League of Their Own', released:1992, tagline:'Once in a lifetime you get a chance to do something different.'})\n", - "CREATE (Madonna:Person {name:'Madonna', born:1954})\n", - "CREATE (GeenaD:Person {name:'Geena Davis', born:1956})\n", - "CREATE (LoriP:Person {name:'Lori Petty', born:1963})\n", - "CREATE (PennyM:Person {name:'Penny Marshall', born:1943})\n", - "CREATE\n", - "(TomH)-[:ACTED_IN {roles:['Jimmy Dugan']}]->(ALeagueofTheirOwn),\n", - "(GeenaD)-[:ACTED_IN {roles:['Dottie Hinson']}]->(ALeagueofTheirOwn),\n", - "(LoriP)-[:ACTED_IN {roles:['Kit Keller']}]->(ALeagueofTheirOwn),\n", - "(RosieO)-[:ACTED_IN {roles:['Doris Murphy']}]->(ALeagueofTheirOwn),\n", - "(Madonna)-[:ACTED_IN {roles:['\"All the Way\" Mae Mordabito']}]->(ALeagueofTheirOwn),\n", - "(BillPax)-[:ACTED_IN {roles:['Bob Hinson']}]->(ALeagueofTheirOwn),\n", - "(PennyM)-[:DIRECTED]->(ALeagueofTheirOwn)\n", - "\n", - "CREATE (PaulBlythe:Person {name:'Paul Blythe'})\n", - "CREATE (AngelaScope:Person {name:'Angela Scope'})\n", - "CREATE (JessicaThompson:Person {name:'Jessica Thompson'})\n", - "CREATE (JamesThompson:Person {name:'James Thompson'})\n", - "\n", - "CREATE\n", - "(JamesThompson)-[:FOLLOWS]->(JessicaThompson),\n", - "(AngelaScope)-[:FOLLOWS]->(JessicaThompson),\n", - "(PaulBlythe)-[:FOLLOWS]->(AngelaScope)\n", - "\n", - "CREATE\n", - "(JessicaThompson)-[:REVIEWED {summary:'An amazing journey', rating:95}]->(CloudAtlas),\n", - "(JessicaThompson)-[:REVIEWED {summary:'Silly, but fun', rating:65}]->(TheReplacements),\n", - "(JamesThompson)-[:REVIEWED {summary:'The coolest football movie ever', rating:100}]->(TheReplacements),\n", - "(AngelaScope)-[:REVIEWED {summary:'Pretty funny at times', rating:62}]->(TheReplacements),\n", - "(JessicaThompson)-[:REVIEWED {summary:'Dark, but compelling', rating:85}]->(Unforgiven),\n", - "(JessicaThompson)-[:REVIEWED {summary:\"Slapstick redeemed only by the Robin Williams and Gene Hackman's stellar performances\", rating:45}]->(TheBirdcage),\n", - "(JessicaThompson)-[:REVIEWED {summary:'A solid romp', rating:68}]->(TheDaVinciCode),\n", - "(JamesThompson)-[:REVIEWED {summary:'Fun, but a little far fetched', rating:65}]->(TheDaVinciCode),\n", - "(JessicaThompson)-[:REVIEWED {summary:'You had me at Jerry', rating:92}]->(JerryMaguire)\n", - "\n", - "WITH TomH as a\n", - "MATCH (a)-[:ACTED_IN]->(m)<-[:DIRECTED]-(d) RETURN a,m,d LIMIT 10;" - ], - "metadata": { - "id": "QFbjo1k24YEY" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "You can query the database via **cy2py** in this simple way" - ], - "metadata": { - "id": "peqcEHj0b35T" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "CALL apoc.meta.graph()" - ], - "metadata": { 
- "id": "BfFOTNkncMqp" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As you can see the model is exactely how we expect!" - ], - "metadata": { - "id": "sGu-zpk8nY5r" - } - }, - { - "cell_type": "code", - "source": [ - "# this step is MANDATORY for the exercises\n", - "from neo4j import GraphDatabase\n", - "neo4j_driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))" - ], - "metadata": { - "id": "_zZF1guo58cc" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c8bQe1b-7RY-" - }, - "source": [ - "# Read data from Neo4j into Spark\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "The query above generates the following graph model:\n", - "\n" - ], - "metadata": { - "id": "ovoUnDmocaxK" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B1LLHYf1CsPh" - }, - "source": [ - "## Read nodes via `labels` option" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "omdSk6ShCqfA" - }, - "outputs": [], - "source": [ - "movies_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('labels', ':Movie')\n", - " .load())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RyglSgXnQcar" - }, - "source": [ - "### Schema description" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f9AaUINjPH4n" - }, - "outputs": [], - "source": [ - "movies_df.printSchema()" - ] - }, - { - "cell_type": "markdown", - "source": [ - "The `movies_df` contains a set of fields, the first two (generally) are always:\n", - "\n", - "* `` which represents the internal Neo4j id\n", - "* `` which represents the list of labels attached to the node\n", - "\n", - "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" - ], - "metadata": { - "id": "jxLcYSkgZ1xf" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "omGjaw5QDgS-" - }, - "outputs": [], - "source": [ - "movies_df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7-KTRC5HD5sO" - }, - "source": [ - "### Exercise\n", - "\n", - "Read all the `Person` nodes store them into a Python variable called `person_df` and then verify the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZhnsFC9KEsLp" - }, - "outputs": [], - "source": [ - "person_df = # write your spark code here" - ] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "person_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('labels', ':Person')\n", - " .load())\n", - "```\n", - "\n", - "
\n", - "\n" - ], - "metadata": { - "id": "O4WEzidAZBh-" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D5_zXyweE-QM" - }, - "outputs": [], - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " created the person_df\n", - "\"\"\"\n", - "\n", - "assert person_df.count() == 133\n", - "assert person_df.schema.fieldNames() == ['', '', 'name', 'born']\n", - "assert person_df.collect()[0][\"\"] == ['Person']\n", - "print(\"All assertion are successfuly satisfied. Congrats you created your first DataFrame\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m1hgGMLCRoZx" - }, - "source": [ - "## Read relationships via `relationship` option" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HgPockV0I5Q3" - }, - "source": [ - "There are two way to transform relationships into DataFrame\n", - "\n", - "* having all the node and relationship data flattened into the DataFrame\n", - "* having all the node properties in maps and the relationship data as columns" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "m0DBqZLtKtvX" - }, - "source": [ - "### DataFrame with flattened data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "796cuMwXR2zi" - }, - "outputs": [], - "source": [ - "actedin_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('relationship', 'ACTED_IN')\n", - " .option('relationship.source.labels', ':Person')\n", - " .option('relationship.target.labels', ':Movie')\n", - " .load())" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Schema description" - ], - "metadata": { - "id": "yzyviI5vXO4K" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5uDWZqoySNGc" - }, - "outputs": [], - "source": [ - "actedin_df.printSchema()" - ] - }, - { - "cell_type": "markdown", - "source": [ - "The `movies_df` contains a set of fields, the first two (generally) are always:\n", - "\n", - "* `` which represents the internal Neo4j relationship id\n", - "* `` which represents the relationship type\n", - "* `` which represents the internal Neo4j node id\n", - "* `` which represents the list of labels attached to the node\n", - "* `rel.*` which represents the properties attached to the relationship\n", - "* `source/target.*` which represents the properties attached to the node\n", - "\n", - "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" - ], - "metadata": { - "id": "2dB9DL7KZxrX" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VPHDTL-IUX2X" - }, - "outputs": [], - "source": [ - "actedin_df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RoPVDptGKy_m" - }, - "source": [ - "### DataFrame with nodes as map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8VxgDlBXIt_h" - }, - "outputs": [], - "source": [ - "actedin_map_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('relationship.nodes.map', True)\n", - " .option('relationship', 'ACTED_IN')\n", - " .option('relationship.source.labels', ':Person')\n", - " .option('relationship.target.labels', ':Movie')\n", - " .load())" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Schema description" - ], - "metadata": { - "id": "nbGAYcd7YMOp" - } - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "id": "gF5-m4BbI2Ib" - }, - "outputs": [], - "source": [ - "actedin_map_df.printSchema()" - ] - }, - { - "cell_type": "markdown", - "source": [ - "The `movies_df` contains a set of fields, the first two (generally) are always:\n", - "\n", - "* `` which represents the internal Neo4j relationship id\n", - "* `` which represents the relationship type\n", - "* `` which represents a map with node values\n", - "* `rel.*` which represents the properties attached to the relationship\n", - "\n", - "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" - ], - "metadata": { - "id": "Zuu42SpfZ502" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UUYUwE3CLA_r" - }, - "outputs": [], - "source": [ - "actedin_map_df" - ] - }, - { - "cell_type": "code", - "source": [ - "actedin_map_df.collect()[0][\"\"]" - ], - "metadata": { - "id": "hHPq7neyYDSx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise\n", - "\n", - "Read all the `DIRECTED` relationships" - ], - "metadata": { - "id": "Viop-9_thCbF" - } - }, - { - "cell_type": "code", - "source": [ - "directed_df = # write your spark code here" - ], - "metadata": { - "id": "j0tTsk59hhLh" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "directed_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('relationship', 'DIRECTED')\n", - " .option('relationship.source.labels', ':Person')\n", - " .option('relationship.target.labels', ':Movie')\n", - " .load())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "VQYyYSMpj2lf" - } - }, - { - "cell_type": "code", - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " created the directed_df\n", - "\"\"\"\n", - "\n", - "assert directed_df.count() == 44\n", - "assert directed_df.schema.fieldNames() == ['',\n", - " '',\n", - " '',\n", - " '',\n", - " 'source.name',\n", - " 'source.born',\n", - " '',\n", - " '',\n", - " 'target.title',\n", - " 'target.tagline',\n", - " 'target.released']\n", - "assert directed_df.collect()[0][\"\"] == 'DIRECTED'\n", - "print(\"All assertion are successfuly satisfied. Congrats you created your first relationship DataFrame\")" - ], - "metadata": { - "id": "HRwaJ8PvhudP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Read arbitrary data via Cypher query" - ], - "metadata": { - "id": "hCpzW904dS2r" - } - }, - { - "cell_type": "code", - "source": [ - "cypher_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('query', '''\n", - " // Extend Tom Hanks co-actors, to find co-co-actors who haven't worked with Tom Hanks\n", - " MATCH (tom:Person {name:\"Tom Hanks\"})-[:ACTED_IN]->(m)<-[:ACTED_IN]-(coActors),\n", - " (coActors)-[:ACTED_IN]->(m2)<-[:ACTED_IN]-(cocoActors)\n", - " WHERE NOT (tom)-[:ACTED_IN]->()<-[:ACTED_IN]-(cocoActors)\n", - " AND tom <> cocoActors\n", - " RETURN cocoActors.name AS Recommended, count(*) AS Strength\n", - " ORDER BY Strength DESC\n", - " ''')\n", - " .load())" - ], - "metadata": { - "id": "hplBy0b_dhnb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Schema description" - ], - "metadata": { - "id": "tRZPA6xWeSCT" - } - }, - { - "cell_type": "code", - "source": [ - "cypher_df.printSchema()" - ], - "metadata": { - "id": "hU-JfgNNeL5f" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "cypher_df" - ], - "metadata": { - "id": "8IcUQsileXQ7" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// Just for debugging purposes let's check the same query directly from the database\n", - "MATCH (tom:Person {name:\"Tom Hanks\"})-[:ACTED_IN]->(m)<-[:ACTED_IN]-(coActors),\n", - " (coActors)-[:ACTED_IN]->(m2)<-[:ACTED_IN]-(cocoActors)\n", - "WHERE NOT (tom)-[:ACTED_IN]->()<-[:ACTED_IN]-(cocoActors)\n", - " AND tom <> cocoActors\n", - "RETURN cocoActors.name AS Recommended, count(*) AS Strength\n", - "ORDER BY Strength DESC\n", - "LIMIT 20" - ], - "metadata": { - "id": "CWtNAeoN4O6S" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise\n", - "\n", - "Return all the actors that have also directed a movie.\n", - "\n", - "The returned DataFrame must have 3 columns:\n", - "\n", - "* `name` the actor name\n", - "* `acted_in` a list of unique films (title) where he acted in\n", - "* `directed` a list of unique films (title) where he was a director" - ], - "metadata": { - "id": "Hyq4KsKQegdE" - } - }, - { - "cell_type": "code", - "source": [ - "your_cypher_df = # write your spark code here" - ], - "metadata": { - "id": "0h1FFuYxej2f" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "your_cypher_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('query', '''\n", - " MATCH (p:Person)\n", - " MATCH (p)-[:ACTED_IN]->(m:Movie)\n", - " MATCH (p)-[:DIRECTED]->(m1:Movie)\n", - " RETURN p.name AS name, collect(m.title) AS acted_in, collect(m1.title) AS directed\n", - " ''')\n", - " .load())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "CQncCY52lCxv" - } - }, - { - "cell_type": "code", - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " created the your_cypher_df\n", - "\"\"\"\n", - "\n", - "assert your_cypher_df.count() == 5\n", - "assert your_cypher_df.schema.fieldNames() == ['name', 'acted_in', 'directed']\n", - "your_cypher_df_collect = your_cypher_df.collect()\n", - "assert frozenset(map(lambda row: row['name'], your_cypher_df_collect)) == frozenset(['Clint Eastwood',\n", - " 'Danny DeVito',\n", - " 'James Marshall',\n", - " 'Werner Herzog',\n", - " 'Tom Hanks'])\n", - "assert frozenset(map(lambda row: frozenset(row['acted_in']), your_cypher_df_collect)) == set([\n", - " frozenset([\"Apollo 13\", \"You've Got Mail\", \"A League of Their Own\", \"Joe Versus the Volcano\", \"That Thing You Do\", \"The Da Vinci Code\", \"Cloud Atlas\", \"Cast Away\", \"The Green Mile\", \"Sleepless in Seattle\", \"The Polar Express\", \"Charlie Wilson's War\"]),\n", - " frozenset([\"What Dreams May Come\"]),\n", - " frozenset([\"Unforgiven\"]),\n", - " frozenset([\"A Few Good Men\"]),\n", - " frozenset([\"Hoffa\", \"One Flew Over the Cuckoo's Nest\"])\n", - " ])\n", - "assert frozenset(map(lambda row: frozenset(row['directed']), your_cypher_df_collect)) == set([\n", - " frozenset([\"That Thing You Do\"]),\n", - " frozenset([\"RescueDawn\"]),\n", - " frozenset([\"Unforgiven\"]),\n", - " frozenset([\"V for Vendetta\", \"Ninja Assassin\"]),\n", - " frozenset([\"Hoffa\"])\n", - " ])\n", - "print(\"All assertion are successfuly satisfied. Congrats you created your first cypher dataframe\")" - ], - "metadata": { - "id": "xG_7Wy-_go5V" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hFpA11aK8ADf" - }, - "source": [ - "# Write data from Spark to Neo4j" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## The graph model\n", - "\n", - "Our goal is to create this simple graph model\n", - "\n", - "" - ], - "metadata": { - "id": "Mx84Qi1PcHF_" - } - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Trt-L_9pMQf1" - }, - "source": [ - "### Download The Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "h4o07NpuJmaG" - }, - "outputs": [], - "source": [ - "!wget -q https://s3.amazonaws.com/dev.assets.neo4j.com/wp-content/uploads/desktop-csv-import.zip" - ] - }, - { - "cell_type": "markdown", - "source": [ - "The zip is composed of three files:\n", - "* products.csv: describes the products and has three columns (and no header)\n", - "* orders.csv: has three columns (with the header) and describe the order\n", - "* order-details.csv: is the \"join\" table between orders and products; it has three columns with header" - ], - "metadata": { - "id": "KKfl_ZyhYYWj" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nduIG7H_J0-A" - }, - "outputs": [], - "source": [ - "!unzip desktop-csv-import.zip" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w5kaTPoEQNkT" - }, - "source": [ - "### Explore the Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L0TZgi_E1gAv" - }, - "outputs": [], - "source": [ - "products_df = (spark.read\n", - " .format('csv')\n", - " .option('inferSchema', True)\n", - " .option('path', '/content/desktop-csv-import/products.csv')\n", - " .load())" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "id": "0tptDEUn2WO6" - }, - "outputs": [], - "source": [ - "products_df.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZSW40PxjKvgf" - }, - "outputs": [], - "source": [ - "products_df" - ] - }, - { - "cell_type": "markdown", - "source": [ - "As you can see in the schema, colums have no name, just a generic `_c` prefix concatenated with an index.\n", - "The three columns describe:\n", - "* `_c0` is the `id` of the product\n", - "* `_c1` is the `name`\n", - "* `_c2` is the `price`\n", - "\n", - "Let's rename these columns!" - ], - "metadata": { - "id": "VrKERh4uXu8J" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7xabRYlXQZl4" - }, - "outputs": [], - "source": [ - "products_df = (products_df.withColumnRenamed('_c0', 'id')\n", - " .withColumnRenamed('_c1', 'name')\n", - " .withColumnRenamed('_c2', 'price'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "w08HTcIeQuNn" - }, - "outputs": [], - "source": [ - "products_df.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XCkyYIyzQv0X" - }, - "outputs": [], - "source": [ - "products_df" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Write nodes via `label` option" - ], - "metadata": { - "id": "XL9oBe0-m680" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Oy4eoeAMRWxc" - }, - "outputs": [], - "source": [ - "(products_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('append')\n", - " .option('labels', ':Product')\n", - " .save())" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Let's check if the nodes are in the database!" - ], - "metadata": { - "id": "MLxewMeFsQj3" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (n:Product)\n", - "RETURN n\n", - "LIMIT 10" - ], - "metadata": { - "id": "2yoBPvmmsUqt" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now just to be sure that we loaded all the nodes into Neo4j we'll count the dataframe and the nodes inside the database" - ], - "metadata": { - "id": "10PLvdmZ0tZT" - } - }, - { - "cell_type": "code", - "source": [ - "products_df.count()" - ], - "metadata": { - "id": "AmwadsK702Sl" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (n:Product)\n", - "RETURN count(n)" - ], - "metadata": { - "id": "J8e9jZPG06DL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "If the two counts are equal, all the data has been properly imported." - ], - "metadata": { - "id": "T8XAaw3K1Mga" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Create Constraints\n", - "\n", - "Oh but wait, we forgot to create constraints!!! 
if we go into the Neo4j browser and excute the following query:\n", - "\n", - "```cypher\n", - "show constraints\n", - "```\n", - "\n", - "We should get the constraints of the movie database, but not one for `Product`.\n", - "\n", - "So please create the constaints for the node `Product`:\n", - "\n", - "```cypher\n", - "CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE;\n", - "```\n", - "\n", - "But if you want, you can also delegate the Spark connector to perform optimizations pre-processing by usign the option `schema.optimization.type` which can assume three values:\n", - "\n", - "* `INDEX`: it creates only indexes on provided nodes.\n", - "* `NODE_CONSTRAINTS`: it creates only indexes on provided nodes.\n", - "\n", - "So let's create the `Order` node with by let the connector creating the constraints for you" - ], - "metadata": { - "id": "jFL_tQk0tsni" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// Check the constraints\n", - "SHOW CONSTRAINTS" - ], - "metadata": { - "id": "T3PUsfIvsi23" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// Create the constraint for Product node\n", - "CREATE CONSTRAINT product_id IF NOT EXISTS FOR (p:Product) REQUIRE p.id IS UNIQUE;" - ], - "metadata": { - "id": "iMF68OU20XhE" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// Check (again) the constraints\n", - "SHOW CONSTRAINTS" - ], - "metadata": { - "id": "w_wSMYvz0fz_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As you can see now we have the `product_id` constraint in the result list" - ], - "metadata": { - "id": "2JwYLMva1VnN" - } - }, - { - "cell_type": "code", - "source": [ - "orders_df = (spark.read\n", - " .format('csv')\n", - " .option('inferSchema', True)\n", - " .option('header', True)\n", - " .option('path', '/content/desktop-csv-import/orders.csv')\n", - " .load())" - ], - "metadata": { - "id": "m5ge2R_Ggd3K" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "orders_df.printSchema()" - ], - "metadata": { - "id": "7UGwHtuJwFU4" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "orders_df" - ], - "metadata": { - "id": "uGmUP5ZMwJkm" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# we cast orderDate to timestamp in order to have it converted properly into Neo4j\n", - "orders_df = orders_df.selectExpr('orderID AS id', 'CAST(orderDate AS TIMESTAMP) AS date', 'shipCountry')" - ], - "metadata": { - "id": "InwYglcUwXNy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "orders_df.printSchema()" - ], - "metadata": { - "id": "kSJmW81cw_ES" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "orders_df" - ], - "metadata": { - "id": "7VN5RAizxSr2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "(orders_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('labels', ':Order')\n", - " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", - " # this is necessary in order to specify what is the constraint field\n", - " .option('node.keys', 'id')\n", - " .save())" - ], - "metadata": { - "id": "0rwKB9V-xTzu" - }, - "execution_count": null, - "outputs": [] 
- }, - { - "cell_type": "markdown", - "source": [ - "Now let's check if the connector has created the constraint for us" - ], - "metadata": { - "id": "A_T2beEJx3Ho" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "SHOW CONSTRAINTS" - ], - "metadata": { - "id": "hDxdKyFY1sDT" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As you can see the we have the constraint `spark_NODE_CONSTRAINTS_Order_id` that has been create by the Spark connector itself.\n", - "\n", - "Now just because we're courious let's check if the data has been propertly loaded.\n", - "\n", - "The first thing to check is if the count of the Dataframe and the nodes in Neo4j matches." - ], - "metadata": { - "id": "H5Bne-Mq1vsO" - } - }, - { - "cell_type": "code", - "source": [ - "orders_df.count()" - ], - "metadata": { - "id": "mANrH-Zt2ShO" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (o:Order)\n", - "RETURN count(o)" - ], - "metadata": { - "id": "vzCCVYAK2V0X" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now we want to check if the data has been loaded with the proper data type, in particular we created a new column `date` by casting `orderDate` to `TIMESTAMP`." - ], - "metadata": { - "id": "X8BBxCQg2dFM" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (o:Order)\n", - "RETURN apoc.meta.cypher.type(o.date), count(o)" - ], - "metadata": { - "id": "VN7XMI192xAP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "So all the `date` values have the same type." - ], - "metadata": { - "id": "PpXyKtJF3Lk4" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise\n", - "\n", - "Given the `football_teams_df` and `football_player_df` below please:\n", - "* for `football_teams_df` insert it as nodes with label `:FootballTeam` in Neo4j.\n", - "* for `football_player_df` insert it as nodes with label `:FootballPlayer` in Neo4j.\n", - "\n", - "Create for both of them constraints via the schema optimization feature:\n", - "* for `football_teams_df` the key must be the property `id`\n", - "* for `football_player_df` the key must be the property `name`" - ], - "metadata": { - "id": "Zi4Dl2LqmVmN" - } - }, - { - "cell_type": "code", - "source": [ - "football_teams_df = spark.createDataFrame([{'id': 1, 'name': 'AC Milan'}, {'id': 2, 'name': 'FC Internazionale'}])\n", - "football_player_df = spark.createDataFrame([\n", - " {'name': 'Zlatan Ibrahimovic'},\n", - " {'name': 'Sandro Tonali'},\n", - " {'name': 'Nicolò Barella'},\n", - " {'name': 'Marcelo Brozovic'}])" - ], - "metadata": { - "id": "21tFDtgAmVON" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# write your spark code that persist football_teams_df and football_player_df here" - ], - "metadata": { - "id": "utpvz-fI6blD" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "# write the teams\n", - "(football_teams_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('labels', ':FootballTeam')\n", - " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", - " .option('node.keys', 'id')\n", - " .save())\n", - "# write the players\n", - "(football_player_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('labels', ':FootballPlayer')\n", - " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", - " .option('node.keys', 'name')\n", - " .save())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "xeVxPe7PmEy8" - } - }, - { - "cell_type": "code", - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " persisted football_teams_df and\n", - " football_player_df in Neo4j as nodes\n", - "\"\"\"\n", - "\n", - "with neo4j_driver.session() as session:\n", - " # count football players\n", - " football_players = session.read_transaction(lambda tx: (tx.run('''\n", - " MATCH (p:FootballPlayer)\n", - " WHERE p.name IN ['Zlatan Ibrahimovic', 'Sandro Tonali',\n", - " 'Nicolò Barella', 'Marcelo Brozovic']\n", - " RETURN count(p) AS count\n", - " ''').single()['count']))\n", - " assert football_players == 4\n", - "\n", - " # count football teams\n", - " football_teams = session.read_transaction(lambda tx: (tx.run('''\n", - " MATCH (p:FootballTeam)\n", - " WHERE p.name IN ['AC Milan', 'FC Internazionale']\n", - " RETURN count(p) AS count\n", - " ''').single()['count']))\n", - " assert football_teams == 2\n", - "\n", - " # count constraints\n", - " football_constraints = session.read_transaction(lambda tx: (tx.run('''\n", - " SHOW CONSTRAINTS YIELD name\n", - " WHERE name IN ['spark_NODE_CONSTRAINTS_FootballPlayer_name', 'spark_NODE_CONSTRAINTS_FootballTeam_id']\n", - " RETURN count(*) AS count\n", - " ''').single()['count']))\n", - " assert football_constraints == 2\n", - "\n", - "print(\"All assertion are successfuly satisfied. Congrats you saved your first Node DataFrame into Neo4j!\")" - ], - "metadata": { - "id": "a5zQCEyK6h5f" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Write relationships via `relationship` option" - ], - "metadata": { - "id": "QltYkhIuy2Kc" - } - }, - { - "cell_type": "code", - "source": [ - "order_details_df = (spark.read\n", - " .format('csv')\n", - " .option('inferSchema', True)\n", - " .option('header', True)\n", - " .option('path', '/content/desktop-csv-import/order-details.csv')\n", - " .load())" - ], - "metadata": { - "id": "y3U_X0b7x2UN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "order_details_df.printSchema()" - ], - "metadata": { - "id": "zFqKW-j-zRyk" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "order_details_df" - ], - "metadata": { - "id": "5s5f3W984Jc0" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Please remember that this is the pattern that we want to ingest:\n", - "\n", - "\n", - "" - ], - "metadata": { - "id": "aZWSOtNK2qEw" - } - }, - { - "cell_type": "code", - "source": [ - "(order_details_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('relationship', 'CONTAINS')\n", - " .option('relationship.save.strategy', 'keys')\n", - " .option('relationship.source.labels', ':Product')\n", - " .option('relationship.source.save.mode', 'Match')\n", - " .option('relationship.source.node.keys', 'productID:id')\n", - " .option('relationship.target.labels', ':Order')\n", - " .option('relationship.target.save.mode', 'Match')\n", - " .option('relationship.target.node.keys', 'orderID:id')\n", - " .option('relationship.properties', 'quantity:quantityOrdered')\n", - " .save())" - ], - "metadata": { - "id": "rFo3KWA90rmZ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now let's check the count for both Dataframe and relationships in Neo4j" - ], - 
"metadata": { - "id": "1PaC36iZ3bNf" - } - }, - { - "cell_type": "code", - "source": [ - "order_details_df.count()" - ], - "metadata": { - "id": "OUZ5FYHP3qvj" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (p:Product)-[r:CONTAINS]->(o:Order)\n", - "RETURN count(r)" - ], - "metadata": { - "id": "ex8o2pSo34cI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise\n", - "\n", - "Given the `team_player_df` create a relationship between `:FootballPlayer` and `:FootballTeam` of type `PLAYS_FOR`:\n", - "\n", - "```cypher\n", - "(:FootballPlayer)-[:PLAYS_FOR]->(:FootballTeam)\n", - "```" - ], - "metadata": { - "id": "lTBo369687lC" - } - }, - { - "cell_type": "code", - "source": [ - "team_player_df = spark.createDataFrame([\n", - " {'id': 1, 'football_player': 'Zlatan Ibrahimovic'},\n", - " {'id': 1, 'football_player': 'Sandro Tonali'},\n", - " {'id': 2, 'football_player': 'Nicolò Barella'},\n", - " {'id': 2, 'football_player': 'Marcelo Brozovic'}])" - ], - "metadata": { - "id": "gi9kB0l49f8H" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# write your spark code that persist team_player_df here" - ], - "metadata": { - "id": "i7ZjASDx_Kiy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "(team_player_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('relationship', 'PLAYS_FOR')\n", - " .option('relationship.save.strategy', 'keys')\n", - " .option('relationship.source.labels', ':FootballPlayer')\n", - " .option('relationship.source.save.mode', 'Match')\n", - " .option('relationship.source.node.keys', 'football_player:name')\n", - " .option('relationship.target.labels', ':FootballTeam')\n", - " .option('relationship.target.save.mode', 'Match')\n", - " .option('relationship.target.node.keys', 'id')\n", - " .save())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "oNRJevUSm0Wi" - } - }, - { - "cell_type": "code", - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " persisted team_player_df as relationships\n", - "\"\"\"\n", - "\n", - "with neo4j_driver.session() as session:\n", - " # count relationships\n", - " def count_relationships(tx):\n", - " result = tx.run('''\n", - " MATCH (p:FootballPlayer)-[:PLAYS_FOR]->(t:FootballTeam)\n", - " RETURN t.name AS team, collect(p.name) AS players\n", - " ORDER by team\n", - " ''')\n", - " return [{'team': record['team'], 'players': set(record['players'])} for record in result]\n", - "\n", - " actual = session.read_transaction(count_relationships)\n", - " expected = [\n", - " {'team': 'AC Milan', 'players': frozenset(['Zlatan Ibrahimovic', 'Sandro Tonali'])},\n", - " {'team': 'FC Internazionale', 'players': frozenset(['Nicolò Barella', 'Marcelo Brozovic'])}\n", - " ]\n", - " assert actual == expected\n", - "\n", - "print(\"All assertion are successfuly satisfied. Congrats you saved your first Relationship DataFrame into Neo4j!\")" - ], - "metadata": { - "id": "LDYbmoUx_Owb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Write custom graphs via Cypher Query\n", - "\n", - "Now let's consider that two actors created an order and bought several products, and we want to add information in our database." - ], - "metadata": { - "id": "vBq9NCWlZkHw" - } - }, - { - "cell_type": "code", - "source": [ - "actor_orders = [\n", - " {'actor_name': 'Cuba Gooding Jr.', 'order_id': 1, 'products': [11, 42, 72], 'quantities': [1, 2, 3], 'order_date': '2022-06-07 00:00:00'},\n", - " {'actor_name': 'Tom Hanks', 'order_id': 2, 'products': [24, 55, 75], 'quantities': [3, 2, 1], 'order_date': '2022-06-06 00:00:00'}\n", - "]\n", - "\n", - "actor_orders_df = spark.createDataFrame(actor_orders)" - ], - "metadata": { - "id": "3y_yEOouaHxe" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "actor_orders_df.printSchema()" - ], - "metadata": { - "id": "hYo7nlgvbfdm" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "actor_orders_df" - ], - "metadata": { - "id": "fOkW4w2lbhZP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "In this case please go into Neo4j and create the following constraint:\n", - "\n", - "```cypher\n", - "CREATE CONSTRAINT person_name FOR (p:Person) REQUIRE p.name is UNIQUE;\n", - "```" - ], - "metadata": { - "id": "q0wAIB5l7qp9" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// if you didn't before create the constraint on Person.name\n", - "CREATE CONSTRAINT person_name IF NOT EXISTS FOR (p:Person) REQUIRE p.name is UNIQUE;" - ], - "metadata": { - "id": "-_Lxm_zV4qw2" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "(actor_orders_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('query', '''\n", - " MATCH (person:Person {name: event.actor_name})\n", - " MERGE (order:Order {id: event.order_id, date: datetime(replace(event.order_date, ' ', 'T'))})\n", - " MERGE (person)-[:CREATED]->(order)\n", - " WITH event, order\n", - " UNWIND range(0, size(event.products) - 1) AS index\n", - " MATCH (product:Product {id: event.products[index]})\n", - " MERGE (product)-[:CONTAINS{quantityOrdered: 
event.quantities[index]}]->(order)\n", - " ''')\n", - " .save())" - ], - "metadata": { - "id": "cm3RQyLbbjue" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "What we expect now is that for the two actors there are two orders one per each, then each order contains three products." - ], - "metadata": { - "id": "61LUSN4F6pQq" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (a:Person)-[:CREATED]->(o:Order)<-[c:CONTAINS]-(p:Product)\n", - "WHERE a.name IN ['Cuba Gooding Jr.', 'Tom Hanks']\n", - "RETURN a.name, o.id, o.date, p.name, c.quantityOrdered" - ], - "metadata": { - "id": "8fVMQiTf61mN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise\n", - "\n", - "Given `neo4j_resources_df` build a small Knowledge Graph in Neo4j with the following structure:\n", - "\n", - "```cypher\n", - "(:Author{name})-[:CREATED]->(:Resource{name})-[:HAS_TAG]->(:Tag{name})\n", - "```" - ], - "metadata": { - "id": "pn7IM8me9R3I" - } - }, - { - "cell_type": "code", - "source": [ - "neo4j_resources_df = spark.createDataFrame([\n", - " {'author': 'LARUS Business Automation', 'resource': 'Galileo.XAI', 'tags': ['Graph Machine Learning', 'Neo4j', 'Explainable AI', 'Artificial Intelligence']},\n", - " {'author': 'Neo4j', 'resource': 'Graph Data Science Library', 'tags': ['Graph Machine Learning', 'Algorithms']},\n", - " {'author': 'Michael Hunger', 'resource': 'APOC', 'tags': ['Graph Data Integration', 'Graph Algorithms']}\n", - "])" - ], - "metadata": { - "id": "_wmoLl8d9RVz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "neo4j_resources_df" - ], - "metadata": { - "id": "NkG1jCynLXgJ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# write your spark code that persist neo4j_resources_df here" - ], - "metadata": { - "id": "8EQmY-qhsbi1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "(neo4j_resources_df.write\n", - " .format('org.neo4j.spark.DataSource')\n", - " .mode('overwrite')\n", - " .option('query', '''\n", - " MERGE (a:Author {name: event.author})\n", - " MERGE (r:Resource {name: event.resource})\n", - " MERGE (a)-[:CREATED]->(r)\n", - " WITH a, r, event\n", - " UNWIND event.tags AS tag\n", - " MERGE (t:Tag{name: tag})\n", - " MERGE (r)-[:HAS_TAG]->(t)\n", - " ''')\n", - " .save())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "KCfw_saanywo" - } - }, - { - "cell_type": "code", - "source": [ - "\"\"\"\n", - " This paragraph is for validating the code the you\n", - " wrote above, please execute it after you\n", - " persisted neo4j_resources_df as Cypher query\n", - "\"\"\"\n", - "\n", - "with neo4j_driver.session() as session:\n", - " # count relationships\n", - " def check_graph_consistency(tx):\n", - " result = tx.run('''\n", - " MATCH (a:Author)-[:CREATED]->(r:Resource)-[:HAS_TAG]->(t:Tag)\n", - " RETURN a.name AS author, r.name AS resource, collect(t.name) AS tags\n", - " ORDER By author\n", - " ''')\n", - " return [{'author': record['author'], 'resource': record['resource'], 'tags': set(record['tags'])} for record in result]\n", - "\n", - " actual = session.read_transaction(check_graph_consistency)\n", - " expected = [\n", - " {'author': 'LARUS Business Automation', 'resource': 'Galileo.XAI', 'tags': frozenset(['Graph Machine Learning', 'Neo4j', 'Explainable AI', 'Artificial Intelligence'])},\n", - " {'author': 'Michael Hunger', 'resource': 'APOC', 'tags': frozenset(['Graph Data Integration', 'Graph Algorithms'])},\n", - " {'author': 'Neo4j', 'resource': 'Graph Data Science Library', 'tags': frozenset(['Graph Machine Learning', 'Algorithms'])}\n", - " ]\n", - " assert actual == expected\n", - "\n", - "print(\"All assertion are successfuly satisfied. Congrats you saved your first Knowledge Graph DataFrame into Neo4j!\")" - ], - "metadata": { - "id": "LwqbSsEcsgDi" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "_LbqufNZMj6-" - }, - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Open this notebook in Google Colab \n", + " \"Open\n", + "" + ], + "metadata": { + "id": "EhTThKJMxDCy" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7Nvb-_bYx359" + }, + "source": [ + "# Example of a Simple data engineering workflow with Neo4j and Spark" + ] + }, + { + "cell_type": "markdown", + "source": [ + "This notebook contains a set of examples that explains how to extract insights from data using the Neo4j Connector for Apache Spark in a Data Engineering workflow with [AuraDB](https://neo4j.com/docs/aura/auradb/) our fully managed version of Neo4j database.\n", + "\n", + "The notebooks will enable you to test your knowledge with a set of exercises after each section.\n", + "\n", + "If you have any questions or problems feel free to write a post in the [Neo4j community forum](https://community.neo4j.com/) or in [Discord](https://discord.com/invite/neo4j).\n", + "\n", + "If you want more exercises feel free to open an issue in the [GitHub repository](https://github.com/neo4j/neo4j-spark-connector).\n", + "\n", + "Enjoy!" 
+ ], + "metadata": { + "id": "e0bo6ido8tL7" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hXwkjQMnMXED" + }, + "source": [ + "### Configure the Spark Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BhZwh-RAz6Bo" + }, + "outputs": [], + "source": [ + "!apt-get install openjdk-17-jdk-headless -qq > /dev/null" + ] + }, + { + "cell_type": "code", + "source": [ + "spark_version = '3.3.4'" + ], + "metadata": { + "id": "gmEzhrux7Jek" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!wget -q https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz" + ], + "metadata": { + "id": "Ya6Nj_u3vdTL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A3gsnSHl0F99" + }, + "outputs": [], + "source": [ + "!tar xf spark-$spark_version-bin-hadoop3.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hSBQWKs90vSx" + }, + "outputs": [], + "source": [ + "!pip install -q findspark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tnW0a1Gj080k" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-amd64\"\n", + "os.environ[\"SPARK_HOME\"] = f\"/content/spark-{spark_version}-bin-hadoop3\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dlUBSezK1DpZ" + }, + "outputs": [], + "source": [ + "import findspark\n", + "findspark.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rd5KWGQiOVDV" + }, + "outputs": [], + "source": [ + "neo4j_url = \"\" # put your neo4j url here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uXbi_82KOTzU" + }, + "outputs": [], + "source": [ + "neo4j_user = \"neo4j\" # put your neo4j user here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sw50wjxxOUqt" + }, + "outputs": [], + "source": [ + "neo4j_password = \"\" # put your neo4j password here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dOUJ-W871Tur" + }, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = (SparkSession.builder\n", + " .master('local[*]')\n", + " .appName('Data engineering workflow with Neo4j and Spark')\n", + " .config('spark.ui.port', '4050')\n", + " # Just to show dataframes as tables\n", + " .config('spark.sql.repl.eagerEval.enabled', True)\n", + " .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.13:6.0.0_for_spark_4')\n", + " # As we're using always the same database instance we'll\n", + " # define them as global variables\n", + " # so we don't need to repeat them each time\n", + " .config(\"neo4j.url\", neo4j_url)\n", + " .config(\"neo4j.authentication.type\", \"basic\")\n", + " .config(\"neo4j.authentication.basic.username\", neo4j_user)\n", + " .config(\"neo4j.authentication.basic.password\", neo4j_password)\n", + " .getOrCreate())\n", + "spark" + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "## Exercises prerequisite\n", + "\n", + "In this notebook we and going to test your knowledge. 
Some of the exercises require the Neo4j Python driver to check whether you have solved them correctly.\n", + "\n", + "*The Neo4j Python driver is required only for verifying the exercises when you persist data from Spark to Neo4j.*\n", + "\n", + "**It is not required by the Spark connector!**\n", + "\n", + "We'll use [Cy2Py](https://github.com/conker84/cy2py), a Jupyter extension that lets you easily connect to Neo4j and visualize data from Jupyter notebooks.\n", + "For detailed instructions on how to use it, see [this example](https://github.com/conker84/cy2py/blob/main/examples/Neo4j_Crime_Investigation_Dataset.ipynb)" + ], + "metadata": { + "id": "b6_YNZnZ5GdT" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -q cy2py" + ], + "metadata": { + "id": "f5ZZJylo5Bbz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CsnO4C9X7vK0" + }, + "source": [ + "### Configure an Aura instance\n", + "\n", + "
\n", + "

Neo4j AuraDB is a fully managed cloud service: the zero-admin, always-on graph database for cloud developers.

\n", + "\n", + "Create a [free instance](https://console.neo4j.io/?ref=aura-lp&mpp=4bfb2414ab973c741b6f067bf06d5575&mpid=17f40ce03ac883-0f09bb214466c1-37677109-1ea000-17f40ce03ad975&_gl=1*ql4f6s*_ga*MTc2OTMwNjEwMy4xNjQ5NDI3MDE0*_ga_DL38Q8KGQC*MTY1MzQxMDQzMC43OS4xLjE2NTM0MTA3MjQuMA..&_ga=2.136543024.1659283742.1653295079-1769306103.1649427014&_gac=1.216269284.1653306922.CjwKCAjw4ayUBhA4EiwATWyBrl6dN0oaH9_btCfvzdhi77ieNP07GAkOYuz7wx9QEewBnG_FUIMg8xoCgLsQAvD_BwE)\n", + "\n", + "
" + ] + }, + { + "cell_type": "markdown", + "source": [ + "let's load the extension" + ], + "metadata": { + "id": "uKYEPEgOcG2b" + } + }, + { + "cell_type": "code", + "source": [ + "%load_ext cy2py" + ], + "metadata": { + "id": "38EeXF6icKOK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Populate the database\n", + "\n", + "To perform this section go in the Neo4j Brower of your aura instance and paste the following query:\n", + "\n", + "
\n", + "\n", + "Show the Cypher query\n", + "\n", + "\n", + "```cypher\n", + "CREATE (TheMatrix:Movie {title:'The Matrix', released:1999, tagline:'Welcome to the Real World'})\n", + "CREATE (Keanu:Person {name:'Keanu Reeves', born:1964})\n", + "CREATE (Carrie:Person {name:'Carrie-Anne Moss', born:1967})\n", + "CREATE (Laurence:Person {name:'Laurence Fishburne', born:1961})\n", + "CREATE (Hugo:Person {name:'Hugo Weaving', born:1960})\n", + "CREATE (LillyW:Person {name:'Lilly Wachowski', born:1967})\n", + "CREATE (LanaW:Person {name:'Lana Wachowski', born:1965})\n", + "CREATE (JoelS:Person {name:'Joel Silver', born:1952})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrix),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrix),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrix),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrix),\n", + "(LillyW)-[:DIRECTED]->(TheMatrix),\n", + "(LanaW)-[:DIRECTED]->(TheMatrix),\n", + "(JoelS)-[:PRODUCED]->(TheMatrix)\n", + "\n", + "CREATE (Emil:Person {name:\"Emil Eifrem\", born:1978})\n", + "CREATE (Emil)-[:ACTED_IN {roles:[\"Emil\"]}]->(TheMatrix)\n", + "\n", + "CREATE (TheMatrixReloaded:Movie {title:'The Matrix Reloaded', released:2003, tagline:'Free your mind'})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixReloaded),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixReloaded),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixReloaded),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixReloaded),\n", + "(LillyW)-[:DIRECTED]->(TheMatrixReloaded),\n", + "(LanaW)-[:DIRECTED]->(TheMatrixReloaded),\n", + "(JoelS)-[:PRODUCED]->(TheMatrixReloaded)\n", + "\n", + "CREATE (TheMatrixRevolutions:Movie {title:'The Matrix Revolutions', released:2003, tagline:'Everything that has a beginning has an end'})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixRevolutions),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixRevolutions),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixRevolutions),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixRevolutions),\n", + "(LillyW)-[:DIRECTED]->(TheMatrixRevolutions),\n", + "(LanaW)-[:DIRECTED]->(TheMatrixRevolutions),\n", + "(JoelS)-[:PRODUCED]->(TheMatrixRevolutions)\n", + "\n", + "CREATE (TheDevilsAdvocate:Movie {title:\"The Devil's Advocate\", released:1997, tagline:'Evil has its winning ways'})\n", + "CREATE (Charlize:Person {name:'Charlize Theron', born:1975})\n", + "CREATE (Al:Person {name:'Al Pacino', born:1940})\n", + "CREATE (Taylor:Person {name:'Taylor Hackford', born:1944})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Kevin Lomax']}]->(TheDevilsAdvocate),\n", + "(Charlize)-[:ACTED_IN {roles:['Mary Ann Lomax']}]->(TheDevilsAdvocate),\n", + "(Al)-[:ACTED_IN {roles:['John Milton']}]->(TheDevilsAdvocate),\n", + "(Taylor)-[:DIRECTED]->(TheDevilsAdvocate)\n", + "\n", + "CREATE (AFewGoodMen:Movie {title:\"A Few Good Men\", released:1992, tagline:\"In the heart of the nation's capital, in a courthouse of the U.S. 
government, one man will stop at nothing to keep his honor, and one will stop at nothing to find the truth.\"})\n", + "CREATE (TomC:Person {name:'Tom Cruise', born:1962})\n", + "CREATE (JackN:Person {name:'Jack Nicholson', born:1937})\n", + "CREATE (DemiM:Person {name:'Demi Moore', born:1962})\n", + "CREATE (KevinB:Person {name:'Kevin Bacon', born:1958})\n", + "CREATE (KieferS:Person {name:'Kiefer Sutherland', born:1966})\n", + "CREATE (NoahW:Person {name:'Noah Wyle', born:1971})\n", + "CREATE (CubaG:Person {name:'Cuba Gooding Jr.', born:1968})\n", + "CREATE (KevinP:Person {name:'Kevin Pollak', born:1957})\n", + "CREATE (JTW:Person {name:'J.T. Walsh', born:1943})\n", + "CREATE (JamesM:Person {name:'James Marshall', born:1967})\n", + "CREATE (ChristopherG:Person {name:'Christopher Guest', born:1948})\n", + "CREATE (RobR:Person {name:'Rob Reiner', born:1947})\n", + "CREATE (AaronS:Person {name:'Aaron Sorkin', born:1961})\n", + "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Lt. Daniel Kaffee']}]->(AFewGoodMen),\n", + "(JackN)-[:ACTED_IN {roles:['Col. Nathan R. Jessup']}]->(AFewGoodMen),\n", + "(DemiM)-[:ACTED_IN {roles:['Lt. Cdr. JoAnne Galloway']}]->(AFewGoodMen),\n", + "(KevinB)-[:ACTED_IN {roles:['Capt. Jack Ross']}]->(AFewGoodMen),\n", + "(KieferS)-[:ACTED_IN {roles:['Lt. Jonathan Kendrick']}]->(AFewGoodMen),\n", + "(NoahW)-[:ACTED_IN {roles:['Cpl. Jeffrey Barnes']}]->(AFewGoodMen),\n", + "(CubaG)-[:ACTED_IN {roles:['Cpl. Carl Hammaker']}]->(AFewGoodMen),\n", + "(KevinP)-[:ACTED_IN {roles:['Lt. Sam Weinberg']}]->(AFewGoodMen),\n", + "(JTW)-[:ACTED_IN {roles:['Lt. Col. Matthew Andrew Markinson']}]->(AFewGoodMen),\n", + "(JamesM)-[:ACTED_IN {roles:['Pfc. Louden Downey']}]->(AFewGoodMen),\n", + "(ChristopherG)-[:ACTED_IN {roles:['Dr. Stone']}]->(AFewGoodMen),\n", + "(AaronS)-[:ACTED_IN {roles:['Man in Bar']}]->(AFewGoodMen),\n", + "(RobR)-[:DIRECTED]->(AFewGoodMen),\n", + "(AaronS)-[:WROTE]->(AFewGoodMen)\n", + "\n", + "CREATE (TopGun:Movie {title:\"Top Gun\", released:1986, tagline:'I feel the need, the need for speed.'})\n", + "CREATE (KellyM:Person {name:'Kelly McGillis', born:1957})\n", + "CREATE (ValK:Person {name:'Val Kilmer', born:1959})\n", + "CREATE (AnthonyE:Person {name:'Anthony Edwards', born:1962})\n", + "CREATE (TomS:Person {name:'Tom Skerritt', born:1933})\n", + "CREATE (MegR:Person {name:'Meg Ryan', born:1961})\n", + "CREATE (TonyS:Person {name:'Tony Scott', born:1944})\n", + "CREATE (JimC:Person {name:'Jim Cash', born:1941})\n", + "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Maverick']}]->(TopGun),\n", + "(KellyM)-[:ACTED_IN {roles:['Charlie']}]->(TopGun),\n", + "(ValK)-[:ACTED_IN {roles:['Iceman']}]->(TopGun),\n", + "(AnthonyE)-[:ACTED_IN {roles:['Goose']}]->(TopGun),\n", + "(TomS)-[:ACTED_IN {roles:['Viper']}]->(TopGun),\n", + "(MegR)-[:ACTED_IN {roles:['Carole']}]->(TopGun),\n", + "(TonyS)-[:DIRECTED]->(TopGun),\n", + "(JimC)-[:WROTE]->(TopGun)\n", + "\n", + "CREATE (JerryMaguire:Movie {title:'Jerry Maguire', released:2000, tagline:'The rest of his life begins now.'})\n", + "CREATE (ReneeZ:Person {name:'Renee Zellweger', born:1969})\n", + "CREATE (KellyP:Person {name:'Kelly Preston', born:1962})\n", + "CREATE (JerryO:Person {name:\"Jerry O'Connell\", born:1974})\n", + "CREATE (JayM:Person {name:'Jay Mohr', born:1970})\n", + "CREATE (BonnieH:Person {name:'Bonnie Hunt', born:1961})\n", + "CREATE (ReginaK:Person {name:'Regina King', born:1971})\n", + "CREATE (JonathanL:Person {name:'Jonathan Lipnicki', born:1996})\n", + "CREATE (CameronC:Person {name:'Cameron Crowe', born:1957})\n", 
+ "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Jerry Maguire']}]->(JerryMaguire),\n", + "(CubaG)-[:ACTED_IN {roles:['Rod Tidwell']}]->(JerryMaguire),\n", + "(ReneeZ)-[:ACTED_IN {roles:['Dorothy Boyd']}]->(JerryMaguire),\n", + "(KellyP)-[:ACTED_IN {roles:['Avery Bishop']}]->(JerryMaguire),\n", + "(JerryO)-[:ACTED_IN {roles:['Frank Cushman']}]->(JerryMaguire),\n", + "(JayM)-[:ACTED_IN {roles:['Bob Sugar']}]->(JerryMaguire),\n", + "(BonnieH)-[:ACTED_IN {roles:['Laurel Boyd']}]->(JerryMaguire),\n", + "(ReginaK)-[:ACTED_IN {roles:['Marcee Tidwell']}]->(JerryMaguire),\n", + "(JonathanL)-[:ACTED_IN {roles:['Ray Boyd']}]->(JerryMaguire),\n", + "(CameronC)-[:DIRECTED]->(JerryMaguire),\n", + "(CameronC)-[:PRODUCED]->(JerryMaguire),\n", + "(CameronC)-[:WROTE]->(JerryMaguire)\n", + "\n", + "CREATE (StandByMe:Movie {title:\"Stand By Me\", released:1986, tagline:\"For some, it's the last real taste of innocence, and the first real taste of life. But for everyone, it's the time that memories are made of.\"})\n", + "CREATE (RiverP:Person {name:'River Phoenix', born:1970})\n", + "CREATE (CoreyF:Person {name:'Corey Feldman', born:1971})\n", + "CREATE (WilW:Person {name:'Wil Wheaton', born:1972})\n", + "CREATE (JohnC:Person {name:'John Cusack', born:1966})\n", + "CREATE (MarshallB:Person {name:'Marshall Bell', born:1942})\n", + "CREATE\n", + "(WilW)-[:ACTED_IN {roles:['Gordie Lachance']}]->(StandByMe),\n", + "(RiverP)-[:ACTED_IN {roles:['Chris Chambers']}]->(StandByMe),\n", + "(JerryO)-[:ACTED_IN {roles:['Vern Tessio']}]->(StandByMe),\n", + "(CoreyF)-[:ACTED_IN {roles:['Teddy Duchamp']}]->(StandByMe),\n", + "(JohnC)-[:ACTED_IN {roles:['Denny Lachance']}]->(StandByMe),\n", + "(KieferS)-[:ACTED_IN {roles:['Ace Merrill']}]->(StandByMe),\n", + "(MarshallB)-[:ACTED_IN {roles:['Mr. Lachance']}]->(StandByMe),\n", + "(RobR)-[:DIRECTED]->(StandByMe)\n", + "\n", + "CREATE (AsGoodAsItGets:Movie {title:'As Good as It Gets', released:1997, tagline:'A comedy from the heart that goes for the throat.'})\n", + "CREATE (HelenH:Person {name:'Helen Hunt', born:1963})\n", + "CREATE (GregK:Person {name:'Greg Kinnear', born:1963})\n", + "CREATE (JamesB:Person {name:'James L. Brooks', born:1940})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Melvin Udall']}]->(AsGoodAsItGets),\n", + "(HelenH)-[:ACTED_IN {roles:['Carol Connelly']}]->(AsGoodAsItGets),\n", + "(GregK)-[:ACTED_IN {roles:['Simon Bishop']}]->(AsGoodAsItGets),\n", + "(CubaG)-[:ACTED_IN {roles:['Frank Sachs']}]->(AsGoodAsItGets),\n", + "(JamesB)-[:DIRECTED]->(AsGoodAsItGets)\n", + "\n", + "CREATE (WhatDreamsMayCome:Movie {title:'What Dreams May Come', released:1998, tagline:'After life there is more. 
The end is just the beginning.'})\n", + "CREATE (AnnabellaS:Person {name:'Annabella Sciorra', born:1960})\n", + "CREATE (MaxS:Person {name:'Max von Sydow', born:1929})\n", + "CREATE (WernerH:Person {name:'Werner Herzog', born:1942})\n", + "CREATE (Robin:Person {name:'Robin Williams', born:1951})\n", + "CREATE (VincentW:Person {name:'Vincent Ward', born:1956})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Chris Nielsen']}]->(WhatDreamsMayCome),\n", + "(CubaG)-[:ACTED_IN {roles:['Albert Lewis']}]->(WhatDreamsMayCome),\n", + "(AnnabellaS)-[:ACTED_IN {roles:['Annie Collins-Nielsen']}]->(WhatDreamsMayCome),\n", + "(MaxS)-[:ACTED_IN {roles:['The Tracker']}]->(WhatDreamsMayCome),\n", + "(WernerH)-[:ACTED_IN {roles:['The Face']}]->(WhatDreamsMayCome),\n", + "(VincentW)-[:DIRECTED]->(WhatDreamsMayCome)\n", + "\n", + "CREATE (SnowFallingonCedars:Movie {title:'Snow Falling on Cedars', released:1999, tagline:'First loves last. Forever.'})\n", + "CREATE (EthanH:Person {name:'Ethan Hawke', born:1970})\n", + "CREATE (RickY:Person {name:'Rick Yune', born:1971})\n", + "CREATE (JamesC:Person {name:'James Cromwell', born:1940})\n", + "CREATE (ScottH:Person {name:'Scott Hicks', born:1953})\n", + "CREATE\n", + "(EthanH)-[:ACTED_IN {roles:['Ishmael Chambers']}]->(SnowFallingonCedars),\n", + "(RickY)-[:ACTED_IN {roles:['Kazuo Miyamoto']}]->(SnowFallingonCedars),\n", + "(MaxS)-[:ACTED_IN {roles:['Nels Gudmundsson']}]->(SnowFallingonCedars),\n", + "(JamesC)-[:ACTED_IN {roles:['Judge Fielding']}]->(SnowFallingonCedars),\n", + "(ScottH)-[:DIRECTED]->(SnowFallingonCedars)\n", + "\n", + "CREATE (YouveGotMail:Movie {title:\"You've Got Mail\", released:1998, tagline:'At odds in life... in love on-line.'})\n", + "CREATE (ParkerP:Person {name:'Parker Posey', born:1968})\n", + "CREATE (DaveC:Person {name:'Dave Chappelle', born:1973})\n", + "CREATE (SteveZ:Person {name:'Steve Zahn', born:1967})\n", + "CREATE (TomH:Person {name:'Tom Hanks', born:1956})\n", + "CREATE (NoraE:Person {name:'Nora Ephron', born:1941})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Joe Fox']}]->(YouveGotMail),\n", + "(MegR)-[:ACTED_IN {roles:['Kathleen Kelly']}]->(YouveGotMail),\n", + "(GregK)-[:ACTED_IN {roles:['Frank Navasky']}]->(YouveGotMail),\n", + "(ParkerP)-[:ACTED_IN {roles:['Patricia Eden']}]->(YouveGotMail),\n", + "(DaveC)-[:ACTED_IN {roles:['Kevin Jackson']}]->(YouveGotMail),\n", + "(SteveZ)-[:ACTED_IN {roles:['George Pappas']}]->(YouveGotMail),\n", + "(NoraE)-[:DIRECTED]->(YouveGotMail)\n", + "\n", + "CREATE (SleeplessInSeattle:Movie {title:'Sleepless in Seattle', released:1993, tagline:'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?'})\n", + "CREATE (RitaW:Person {name:'Rita Wilson', born:1956})\n", + "CREATE (BillPull:Person {name:'Bill Pullman', born:1953})\n", + "CREATE (VictorG:Person {name:'Victor Garber', born:1949})\n", + "CREATE (RosieO:Person {name:\"Rosie O'Donnell\", born:1962})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Sam Baldwin']}]->(SleeplessInSeattle),\n", + "(MegR)-[:ACTED_IN {roles:['Annie Reed']}]->(SleeplessInSeattle),\n", + "(RitaW)-[:ACTED_IN {roles:['Suzy']}]->(SleeplessInSeattle),\n", + "(BillPull)-[:ACTED_IN {roles:['Walter']}]->(SleeplessInSeattle),\n", + "(VictorG)-[:ACTED_IN {roles:['Greg']}]->(SleeplessInSeattle),\n", + "(RosieO)-[:ACTED_IN {roles:['Becky']}]->(SleeplessInSeattle),\n", + "(NoraE)-[:DIRECTED]->(SleeplessInSeattle)\n", + "\n", + "CREATE (JoeVersustheVolcano:Movie {title:'Joe Versus the Volcano', released:1990, tagline:'A 
story of love, lava and burning desire.'})\n", + "CREATE (JohnS:Person {name:'John Patrick Stanley', born:1950})\n", + "CREATE (Nathan:Person {name:'Nathan Lane', born:1956})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Joe Banks']}]->(JoeVersustheVolcano),\n", + "(MegR)-[:ACTED_IN {roles:['DeDe', 'Angelica Graynamore', 'Patricia Graynamore']}]->(JoeVersustheVolcano),\n", + "(Nathan)-[:ACTED_IN {roles:['Baw']}]->(JoeVersustheVolcano),\n", + "(JohnS)-[:DIRECTED]->(JoeVersustheVolcano)\n", + "\n", + "CREATE (WhenHarryMetSally:Movie {title:'When Harry Met Sally', released:1998, tagline:'Can two friends sleep together and still love each other in the morning?'})\n", + "CREATE (BillyC:Person {name:'Billy Crystal', born:1948})\n", + "CREATE (CarrieF:Person {name:'Carrie Fisher', born:1956})\n", + "CREATE (BrunoK:Person {name:'Bruno Kirby', born:1949})\n", + "CREATE\n", + "(BillyC)-[:ACTED_IN {roles:['Harry Burns']}]->(WhenHarryMetSally),\n", + "(MegR)-[:ACTED_IN {roles:['Sally Albright']}]->(WhenHarryMetSally),\n", + "(CarrieF)-[:ACTED_IN {roles:['Marie']}]->(WhenHarryMetSally),\n", + "(BrunoK)-[:ACTED_IN {roles:['Jess']}]->(WhenHarryMetSally),\n", + "(RobR)-[:DIRECTED]->(WhenHarryMetSally),\n", + "(RobR)-[:PRODUCED]->(WhenHarryMetSally),\n", + "(NoraE)-[:PRODUCED]->(WhenHarryMetSally),\n", + "(NoraE)-[:WROTE]->(WhenHarryMetSally)\n", + "\n", + "CREATE (ThatThingYouDo:Movie {title:'That Thing You Do', released:1996, tagline:'In every life there comes a time when that thing you dream becomes that thing you do'})\n", + "CREATE (LivT:Person {name:'Liv Tyler', born:1977})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Mr. White']}]->(ThatThingYouDo),\n", + "(LivT)-[:ACTED_IN {roles:['Faye Dolan']}]->(ThatThingYouDo),\n", + "(Charlize)-[:ACTED_IN {roles:['Tina']}]->(ThatThingYouDo),\n", + "(TomH)-[:DIRECTED]->(ThatThingYouDo)\n", + "\n", + "CREATE (TheReplacements:Movie {title:'The Replacements', released:2000, tagline:'Pain heals, Chicks dig scars... 
Glory lasts forever'})\n", + "CREATE (Brooke:Person {name:'Brooke Langton', born:1970})\n", + "CREATE (Gene:Person {name:'Gene Hackman', born:1930})\n", + "CREATE (Orlando:Person {name:'Orlando Jones', born:1968})\n", + "CREATE (Howard:Person {name:'Howard Deutch', born:1950})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Shane Falco']}]->(TheReplacements),\n", + "(Brooke)-[:ACTED_IN {roles:['Annabelle Farrell']}]->(TheReplacements),\n", + "(Gene)-[:ACTED_IN {roles:['Jimmy McGinty']}]->(TheReplacements),\n", + "(Orlando)-[:ACTED_IN {roles:['Clifford Franklin']}]->(TheReplacements),\n", + "(Howard)-[:DIRECTED]->(TheReplacements)\n", + "\n", + "CREATE (RescueDawn:Movie {title:'RescueDawn', released:2006, tagline:\"Based on the extraordinary true story of one man's fight for freedom\"})\n", + "CREATE (ChristianB:Person {name:'Christian Bale', born:1974})\n", + "CREATE (ZachG:Person {name:'Zach Grenier', born:1954})\n", + "CREATE\n", + "(MarshallB)-[:ACTED_IN {roles:['Admiral']}]->(RescueDawn),\n", + "(ChristianB)-[:ACTED_IN {roles:['Dieter Dengler']}]->(RescueDawn),\n", + "(ZachG)-[:ACTED_IN {roles:['Squad Leader']}]->(RescueDawn),\n", + "(SteveZ)-[:ACTED_IN {roles:['Duane']}]->(RescueDawn),\n", + "(WernerH)-[:DIRECTED]->(RescueDawn)\n", + "\n", + "CREATE (TheBirdcage:Movie {title:'The Birdcage', released:1996, tagline:'Come as you are'})\n", + "CREATE (MikeN:Person {name:'Mike Nichols', born:1931})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Armand Goldman']}]->(TheBirdcage),\n", + "(Nathan)-[:ACTED_IN {roles:['Albert Goldman']}]->(TheBirdcage),\n", + "(Gene)-[:ACTED_IN {roles:['Sen. Kevin Keeley']}]->(TheBirdcage),\n", + "(MikeN)-[:DIRECTED]->(TheBirdcage)\n", + "\n", + "CREATE (Unforgiven:Movie {title:'Unforgiven', released:1992, tagline:\"It's a hell of a thing, killing a man\"})\n", + "CREATE (RichardH:Person {name:'Richard Harris', born:1930})\n", + "CREATE (ClintE:Person {name:'Clint Eastwood', born:1930})\n", + "CREATE\n", + "(RichardH)-[:ACTED_IN {roles:['English Bob']}]->(Unforgiven),\n", + "(ClintE)-[:ACTED_IN {roles:['Bill Munny']}]->(Unforgiven),\n", + "(Gene)-[:ACTED_IN {roles:['Little Bill Daggett']}]->(Unforgiven),\n", + "(ClintE)-[:DIRECTED]->(Unforgiven)\n", + "\n", + "CREATE (JohnnyMnemonic:Movie {title:'Johnny Mnemonic', released:1995, tagline:'The hottest data on earth. In the coolest head in town'})\n", + "CREATE (Takeshi:Person {name:'Takeshi Kitano', born:1947})\n", + "CREATE (Dina:Person {name:'Dina Meyer', born:1968})\n", + "CREATE (IceT:Person {name:'Ice-T', born:1958})\n", + "CREATE (RobertL:Person {name:'Robert Longo', born:1953})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Johnny Mnemonic']}]->(JohnnyMnemonic),\n", + "(Takeshi)-[:ACTED_IN {roles:['Takahashi']}]->(JohnnyMnemonic),\n", + "(Dina)-[:ACTED_IN {roles:['Jane']}]->(JohnnyMnemonic),\n", + "(IceT)-[:ACTED_IN {roles:['J-Bone']}]->(JohnnyMnemonic),\n", + "(RobertL)-[:DIRECTED]->(JohnnyMnemonic)\n", + "\n", + "CREATE (CloudAtlas:Movie {title:'Cloud Atlas', released:2012, tagline:'Everything is connected'})\n", + "CREATE (HalleB:Person {name:'Halle Berry', born:1966})\n", + "CREATE (JimB:Person {name:'Jim Broadbent', born:1949})\n", + "CREATE (TomT:Person {name:'Tom Tykwer', born:1965})\n", + "CREATE (DavidMitchell:Person {name:'David Mitchell', born:1969})\n", + "CREATE (StefanArndt:Person {name:'Stefan Arndt', born:1961})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Zachry', 'Dr. 
Henry Goose', 'Isaac Sachs', 'Dermot Hoggins']}]->(CloudAtlas),\n", + "(Hugo)-[:ACTED_IN {roles:['Bill Smoke', 'Haskell Moore', 'Tadeusz Kesselring', 'Nurse Noakes', 'Boardman Mephi', 'Old Georgie']}]->(CloudAtlas),\n", + "(HalleB)-[:ACTED_IN {roles:['Luisa Rey', 'Jocasta Ayrs', 'Ovid', 'Meronym']}]->(CloudAtlas),\n", + "(JimB)-[:ACTED_IN {roles:['Vyvyan Ayrs', 'Captain Molyneux', 'Timothy Cavendish']}]->(CloudAtlas),\n", + "(TomT)-[:DIRECTED]->(CloudAtlas),\n", + "(LillyW)-[:DIRECTED]->(CloudAtlas),\n", + "(LanaW)-[:DIRECTED]->(CloudAtlas),\n", + "(DavidMitchell)-[:WROTE]->(CloudAtlas),\n", + "(StefanArndt)-[:PRODUCED]->(CloudAtlas)\n", + "\n", + "CREATE (TheDaVinciCode:Movie {title:'The Da Vinci Code', released:2006, tagline:'Break The Codes'})\n", + "CREATE (IanM:Person {name:'Ian McKellen', born:1939})\n", + "CREATE (AudreyT:Person {name:'Audrey Tautou', born:1976})\n", + "CREATE (PaulB:Person {name:'Paul Bettany', born:1971})\n", + "CREATE (RonH:Person {name:'Ron Howard', born:1954})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Dr. Robert Langdon']}]->(TheDaVinciCode),\n", + "(IanM)-[:ACTED_IN {roles:['Sir Leight Teabing']}]->(TheDaVinciCode),\n", + "(AudreyT)-[:ACTED_IN {roles:['Sophie Neveu']}]->(TheDaVinciCode),\n", + "(PaulB)-[:ACTED_IN {roles:['Silas']}]->(TheDaVinciCode),\n", + "(RonH)-[:DIRECTED]->(TheDaVinciCode)\n", + "\n", + "CREATE (VforVendetta:Movie {title:'V for Vendetta', released:2006, tagline:'Freedom! Forever!'})\n", + "CREATE (NatalieP:Person {name:'Natalie Portman', born:1981})\n", + "CREATE (StephenR:Person {name:'Stephen Rea', born:1946})\n", + "CREATE (JohnH:Person {name:'John Hurt', born:1940})\n", + "CREATE (BenM:Person {name: 'Ben Miles', born:1967})\n", + "CREATE\n", + "(Hugo)-[:ACTED_IN {roles:['V']}]->(VforVendetta),\n", + "(NatalieP)-[:ACTED_IN {roles:['Evey Hammond']}]->(VforVendetta),\n", + "(StephenR)-[:ACTED_IN {roles:['Eric Finch']}]->(VforVendetta),\n", + "(JohnH)-[:ACTED_IN {roles:['High Chancellor Adam Sutler']}]->(VforVendetta),\n", + "(BenM)-[:ACTED_IN {roles:['Dascomb']}]->(VforVendetta),\n", + "(JamesM)-[:DIRECTED]->(VforVendetta),\n", + "(LillyW)-[:PRODUCED]->(VforVendetta),\n", + "(LanaW)-[:PRODUCED]->(VforVendetta),\n", + "(JoelS)-[:PRODUCED]->(VforVendetta),\n", + "(LillyW)-[:WROTE]->(VforVendetta),\n", + "(LanaW)-[:WROTE]->(VforVendetta)\n", + "\n", + "CREATE (SpeedRacer:Movie {title:'Speed Racer', released:2008, tagline:'Speed has no limits'})\n", + "CREATE (EmileH:Person {name:'Emile Hirsch', born:1985})\n", + "CREATE (JohnG:Person {name:'John Goodman', born:1960})\n", + "CREATE (SusanS:Person {name:'Susan Sarandon', born:1946})\n", + "CREATE (MatthewF:Person {name:'Matthew Fox', born:1966})\n", + "CREATE (ChristinaR:Person {name:'Christina Ricci', born:1980})\n", + "CREATE (Rain:Person {name:'Rain', born:1982})\n", + "CREATE\n", + "(EmileH)-[:ACTED_IN {roles:['Speed Racer']}]->(SpeedRacer),\n", + "(JohnG)-[:ACTED_IN {roles:['Pops']}]->(SpeedRacer),\n", + "(SusanS)-[:ACTED_IN {roles:['Mom']}]->(SpeedRacer),\n", + "(MatthewF)-[:ACTED_IN {roles:['Racer X']}]->(SpeedRacer),\n", + "(ChristinaR)-[:ACTED_IN {roles:['Trixie']}]->(SpeedRacer),\n", + "(Rain)-[:ACTED_IN {roles:['Taejo Togokahn']}]->(SpeedRacer),\n", + "(BenM)-[:ACTED_IN {roles:['Cass Jones']}]->(SpeedRacer),\n", + "(LillyW)-[:DIRECTED]->(SpeedRacer),\n", + "(LanaW)-[:DIRECTED]->(SpeedRacer),\n", + "(LillyW)-[:WROTE]->(SpeedRacer),\n", + "(LanaW)-[:WROTE]->(SpeedRacer),\n", + "(JoelS)-[:PRODUCED]->(SpeedRacer)\n", + "\n", + "CREATE (NinjaAssassin:Movie {title:'Ninja Assassin', 
released:2009, tagline:'Prepare to enter a secret world of assassins'})\n", + "CREATE (NaomieH:Person {name:'Naomie Harris'})\n", + "CREATE\n", + "(Rain)-[:ACTED_IN {roles:['Raizo']}]->(NinjaAssassin),\n", + "(NaomieH)-[:ACTED_IN {roles:['Mika Coretti']}]->(NinjaAssassin),\n", + "(RickY)-[:ACTED_IN {roles:['Takeshi']}]->(NinjaAssassin),\n", + "(BenM)-[:ACTED_IN {roles:['Ryan Maslow']}]->(NinjaAssassin),\n", + "(JamesM)-[:DIRECTED]->(NinjaAssassin),\n", + "(LillyW)-[:PRODUCED]->(NinjaAssassin),\n", + "(LanaW)-[:PRODUCED]->(NinjaAssassin),\n", + "(JoelS)-[:PRODUCED]->(NinjaAssassin)\n", + "\n", + "CREATE (TheGreenMile:Movie {title:'The Green Mile', released:1999, tagline:\"Walk a mile you'll never forget.\"})\n", + "CREATE (MichaelD:Person {name:'Michael Clarke Duncan', born:1957})\n", + "CREATE (DavidM:Person {name:'David Morse', born:1953})\n", + "CREATE (SamR:Person {name:'Sam Rockwell', born:1968})\n", + "CREATE (GaryS:Person {name:'Gary Sinise', born:1955})\n", + "CREATE (PatriciaC:Person {name:'Patricia Clarkson', born:1959})\n", + "CREATE (FrankD:Person {name:'Frank Darabont', born:1959})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Paul Edgecomb']}]->(TheGreenMile),\n", + "(MichaelD)-[:ACTED_IN {roles:['John Coffey']}]->(TheGreenMile),\n", + "(DavidM)-[:ACTED_IN {roles:['Brutus \"Brutal\" Howell']}]->(TheGreenMile),\n", + "(BonnieH)-[:ACTED_IN {roles:['Jan Edgecomb']}]->(TheGreenMile),\n", + "(JamesC)-[:ACTED_IN {roles:['Warden Hal Moores']}]->(TheGreenMile),\n", + "(SamR)-[:ACTED_IN {roles:['\"Wild Bill\" Wharton']}]->(TheGreenMile),\n", + "(GaryS)-[:ACTED_IN {roles:['Burt Hammersmith']}]->(TheGreenMile),\n", + "(PatriciaC)-[:ACTED_IN {roles:['Melinda Moores']}]->(TheGreenMile),\n", + "(FrankD)-[:DIRECTED]->(TheGreenMile)\n", + "\n", + "CREATE (FrostNixon:Movie {title:'Frost/Nixon', released:2008, tagline:'400 million people were waiting for the truth.'})\n", + "CREATE (FrankL:Person {name:'Frank Langella', born:1938})\n", + "CREATE (MichaelS:Person {name:'Michael Sheen', born:1969})\n", + "CREATE (OliverP:Person {name:'Oliver Platt', born:1960})\n", + "CREATE\n", + "(FrankL)-[:ACTED_IN {roles:['Richard Nixon']}]->(FrostNixon),\n", + "(MichaelS)-[:ACTED_IN {roles:['David Frost']}]->(FrostNixon),\n", + "(KevinB)-[:ACTED_IN {roles:['Jack Brennan']}]->(FrostNixon),\n", + "(OliverP)-[:ACTED_IN {roles:['Bob Zelnick']}]->(FrostNixon),\n", + "(SamR)-[:ACTED_IN {roles:['James Reston, Jr.']}]->(FrostNixon),\n", + "(RonH)-[:DIRECTED]->(FrostNixon)\n", + "\n", + "CREATE (Hoffa:Movie {title:'Hoffa', released:1992, tagline:\"He didn't want law. He wanted justice.\"})\n", + "CREATE (DannyD:Person {name:'Danny DeVito', born:1944})\n", + "CREATE (JohnR:Person {name:'John C. 
Reilly', born:1965})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Hoffa']}]->(Hoffa),\n", + "(DannyD)-[:ACTED_IN {roles:['Robert \"Bobby\" Ciaro']}]->(Hoffa),\n", + "(JTW)-[:ACTED_IN {roles:['Frank Fitzsimmons']}]->(Hoffa),\n", + "(JohnR)-[:ACTED_IN {roles:['Peter \"Pete\" Connelly']}]->(Hoffa),\n", + "(DannyD)-[:DIRECTED]->(Hoffa)\n", + "\n", + "CREATE (Apollo13:Movie {title:'Apollo 13', released:1995, tagline:'Houston, we have a problem.'})\n", + "CREATE (EdH:Person {name:'Ed Harris', born:1950})\n", + "CREATE (BillPax:Person {name:'Bill Paxton', born:1955})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Jim Lovell']}]->(Apollo13),\n", + "(KevinB)-[:ACTED_IN {roles:['Jack Swigert']}]->(Apollo13),\n", + "(EdH)-[:ACTED_IN {roles:['Gene Kranz']}]->(Apollo13),\n", + "(BillPax)-[:ACTED_IN {roles:['Fred Haise']}]->(Apollo13),\n", + "(GaryS)-[:ACTED_IN {roles:['Ken Mattingly']}]->(Apollo13),\n", + "(RonH)-[:DIRECTED]->(Apollo13)\n", + "\n", + "CREATE (Twister:Movie {title:'Twister', released:1996, tagline:\"Don't Breathe. Don't Look Back.\"})\n", + "CREATE (PhilipH:Person {name:'Philip Seymour Hoffman', born:1967})\n", + "CREATE (JanB:Person {name:'Jan de Bont', born:1943})\n", + "CREATE\n", + "(BillPax)-[:ACTED_IN {roles:['Bill Harding']}]->(Twister),\n", + "(HelenH)-[:ACTED_IN {roles:['Dr. Jo Harding']}]->(Twister),\n", + "(ZachG)-[:ACTED_IN {roles:['Eddie']}]->(Twister),\n", + "(PhilipH)-[:ACTED_IN {roles:['Dustin \"Dusty\" Davis']}]->(Twister),\n", + "(JanB)-[:DIRECTED]->(Twister)\n", + "\n", + "CREATE (CastAway:Movie {title:'Cast Away', released:2000, tagline:'At the edge of the world, his journey begins.'})\n", + "CREATE (RobertZ:Person {name:'Robert Zemeckis', born:1951})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Chuck Noland']}]->(CastAway),\n", + "(HelenH)-[:ACTED_IN {roles:['Kelly Frears']}]->(CastAway),\n", + "(RobertZ)-[:DIRECTED]->(CastAway)\n", + "\n", + "CREATE (OneFlewOvertheCuckoosNest:Movie {title:\"One Flew Over the Cuckoo's Nest\", released:1975, tagline:\"If he's crazy, what does that make you?\"})\n", + "CREATE (MilosF:Person {name:'Milos Forman', born:1932})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Randle McMurphy']}]->(OneFlewOvertheCuckoosNest),\n", + "(DannyD)-[:ACTED_IN {roles:['Martini']}]->(OneFlewOvertheCuckoosNest),\n", + "(MilosF)-[:DIRECTED]->(OneFlewOvertheCuckoosNest)\n", + "\n", + "CREATE (SomethingsGottaGive:Movie {title:\"Something's Gotta Give\", released:2003})\n", + "CREATE (DianeK:Person {name:'Diane Keaton', born:1946})\n", + "CREATE (NancyM:Person {name:'Nancy Meyers', born:1949})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Harry Sanborn']}]->(SomethingsGottaGive),\n", + "(DianeK)-[:ACTED_IN {roles:['Erica Barry']}]->(SomethingsGottaGive),\n", + "(Keanu)-[:ACTED_IN {roles:['Julian Mercer']}]->(SomethingsGottaGive),\n", + "(NancyM)-[:DIRECTED]->(SomethingsGottaGive),\n", + "(NancyM)-[:PRODUCED]->(SomethingsGottaGive),\n", + "(NancyM)-[:WROTE]->(SomethingsGottaGive)\n", + "\n", + "CREATE (BicentennialMan:Movie {title:'Bicentennial Man', released:1999, tagline:\"One robot's 200 year journey to become an ordinary man.\"})\n", + "CREATE (ChrisC:Person {name:'Chris Columbus', born:1958})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Andrew Marin']}]->(BicentennialMan),\n", + "(OliverP)-[:ACTED_IN {roles:['Rupert Burns']}]->(BicentennialMan),\n", + "(ChrisC)-[:DIRECTED]->(BicentennialMan)\n", + "\n", + "CREATE (CharlieWilsonsWar:Movie {title:\"Charlie Wilson's War\", released:2007, tagline:\"A stiff drink. A little mascara. 
A lot of nerve. Who said they couldn't bring down the Soviet empire.\"})\n", + "CREATE (JuliaR:Person {name:'Julia Roberts', born:1967})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Rep. Charlie Wilson']}]->(CharlieWilsonsWar),\n", + "(JuliaR)-[:ACTED_IN {roles:['Joanne Herring']}]->(CharlieWilsonsWar),\n", + "(PhilipH)-[:ACTED_IN {roles:['Gust Avrakotos']}]->(CharlieWilsonsWar),\n", + "(MikeN)-[:DIRECTED]->(CharlieWilsonsWar)\n", + "\n", + "CREATE (ThePolarExpress:Movie {title:'The Polar Express', released:2004, tagline:'This Holiday Season... Believe'})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Hero Boy', 'Father', 'Conductor', 'Hobo', 'Scrooge', 'Santa Claus']}]->(ThePolarExpress),\n", + "(RobertZ)-[:DIRECTED]->(ThePolarExpress)\n", + "\n", + "CREATE (ALeagueofTheirOwn:Movie {title:'A League of Their Own', released:1992, tagline:'Once in a lifetime you get a chance to do something different.'})\n", + "CREATE (Madonna:Person {name:'Madonna', born:1954})\n", + "CREATE (GeenaD:Person {name:'Geena Davis', born:1956})\n", + "CREATE (LoriP:Person {name:'Lori Petty', born:1963})\n", + "CREATE (PennyM:Person {name:'Penny Marshall', born:1943})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Jimmy Dugan']}]->(ALeagueofTheirOwn),\n", + "(GeenaD)-[:ACTED_IN {roles:['Dottie Hinson']}]->(ALeagueofTheirOwn),\n", + "(LoriP)-[:ACTED_IN {roles:['Kit Keller']}]->(ALeagueofTheirOwn),\n", + "(RosieO)-[:ACTED_IN {roles:['Doris Murphy']}]->(ALeagueofTheirOwn),\n", + "(Madonna)-[:ACTED_IN {roles:['\"All the Way\" Mae Mordabito']}]->(ALeagueofTheirOwn),\n", + "(BillPax)-[:ACTED_IN {roles:['Bob Hinson']}]->(ALeagueofTheirOwn),\n", + "(PennyM)-[:DIRECTED]->(ALeagueofTheirOwn)\n", + "\n", + "CREATE (PaulBlythe:Person {name:'Paul Blythe'})\n", + "CREATE (AngelaScope:Person {name:'Angela Scope'})\n", + "CREATE (JessicaThompson:Person {name:'Jessica Thompson'})\n", + "CREATE (JamesThompson:Person {name:'James Thompson'})\n", + "\n", + "CREATE\n", + "(JamesThompson)-[:FOLLOWS]->(JessicaThompson),\n", + "(AngelaScope)-[:FOLLOWS]->(JessicaThompson),\n", + "(PaulBlythe)-[:FOLLOWS]->(AngelaScope)\n", + "\n", + "CREATE\n", + "(JessicaThompson)-[:REVIEWED {summary:'An amazing journey', rating:95}]->(CloudAtlas),\n", + "(JessicaThompson)-[:REVIEWED {summary:'Silly, but fun', rating:65}]->(TheReplacements),\n", + "(JamesThompson)-[:REVIEWED {summary:'The coolest football movie ever', rating:100}]->(TheReplacements),\n", + "(AngelaScope)-[:REVIEWED {summary:'Pretty funny at times', rating:62}]->(TheReplacements),\n", + "(JessicaThompson)-[:REVIEWED {summary:'Dark, but compelling', rating:85}]->(Unforgiven),\n", + "(JessicaThompson)-[:REVIEWED {summary:\"Slapstick redeemed only by the Robin Williams and Gene Hackman's stellar performances\", rating:45}]->(TheBirdcage),\n", + "(JessicaThompson)-[:REVIEWED {summary:'A solid romp', rating:68}]->(TheDaVinciCode),\n", + "(JamesThompson)-[:REVIEWED {summary:'Fun, but a little far fetched', rating:65}]->(TheDaVinciCode),\n", + "(JessicaThompson)-[:REVIEWED {summary:'You had me at Jerry', rating:92}]->(JerryMaguire)\n", + "\n", + "WITH TomH as a\n", + "MATCH (a)-[:ACTED_IN]->(m)<-[:DIRECTED]-(d) RETURN a,m,d LIMIT 10;\n", + "```\n", + "\n", + "
\n", + "\n", + "This will create the following graph model\n", + "\n", + "" + ], + "metadata": { + "id": "AQhqv93Mj0Ss" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher -u $neo4j_url -us $neo4j_user -pw $neo4j_password\n", + "// the following Cypher query is the same as above\n", + "// and is required for running the notebook\n", + "CREATE (TheMatrix:Movie {title:'The Matrix', released:1999, tagline:'Welcome to the Real World'})\n", + "CREATE (Keanu:Person {name:'Keanu Reeves', born:1964})\n", + "CREATE (Carrie:Person {name:'Carrie-Anne Moss', born:1967})\n", + "CREATE (Laurence:Person {name:'Laurence Fishburne', born:1961})\n", + "CREATE (Hugo:Person {name:'Hugo Weaving', born:1960})\n", + "CREATE (LillyW:Person {name:'Lilly Wachowski', born:1967})\n", + "CREATE (LanaW:Person {name:'Lana Wachowski', born:1965})\n", + "CREATE (JoelS:Person {name:'Joel Silver', born:1952})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrix),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrix),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrix),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrix),\n", + "(LillyW)-[:DIRECTED]->(TheMatrix),\n", + "(LanaW)-[:DIRECTED]->(TheMatrix),\n", + "(JoelS)-[:PRODUCED]->(TheMatrix)\n", + "\n", + "CREATE (Emil:Person {name:\"Emil Eifrem\", born:1978})\n", + "CREATE (Emil)-[:ACTED_IN {roles:[\"Emil\"]}]->(TheMatrix)\n", + "\n", + "CREATE (TheMatrixReloaded:Movie {title:'The Matrix Reloaded', released:2003, tagline:'Free your mind'})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixReloaded),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixReloaded),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixReloaded),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixReloaded),\n", + "(LillyW)-[:DIRECTED]->(TheMatrixReloaded),\n", + "(LanaW)-[:DIRECTED]->(TheMatrixReloaded),\n", + "(JoelS)-[:PRODUCED]->(TheMatrixReloaded)\n", + "\n", + "CREATE (TheMatrixRevolutions:Movie {title:'The Matrix Revolutions', released:2003, tagline:'Everything that has a beginning has an end'})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Neo']}]->(TheMatrixRevolutions),\n", + "(Carrie)-[:ACTED_IN {roles:['Trinity']}]->(TheMatrixRevolutions),\n", + "(Laurence)-[:ACTED_IN {roles:['Morpheus']}]->(TheMatrixRevolutions),\n", + "(Hugo)-[:ACTED_IN {roles:['Agent Smith']}]->(TheMatrixRevolutions),\n", + "(LillyW)-[:DIRECTED]->(TheMatrixRevolutions),\n", + "(LanaW)-[:DIRECTED]->(TheMatrixRevolutions),\n", + "(JoelS)-[:PRODUCED]->(TheMatrixRevolutions)\n", + "\n", + "CREATE (TheDevilsAdvocate:Movie {title:\"The Devil's Advocate\", released:1997, tagline:'Evil has its winning ways'})\n", + "CREATE (Charlize:Person {name:'Charlize Theron', born:1975})\n", + "CREATE (Al:Person {name:'Al Pacino', born:1940})\n", + "CREATE (Taylor:Person {name:'Taylor Hackford', born:1944})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Kevin Lomax']}]->(TheDevilsAdvocate),\n", + "(Charlize)-[:ACTED_IN {roles:['Mary Ann Lomax']}]->(TheDevilsAdvocate),\n", + "(Al)-[:ACTED_IN {roles:['John Milton']}]->(TheDevilsAdvocate),\n", + "(Taylor)-[:DIRECTED]->(TheDevilsAdvocate)\n", + "\n", + "CREATE (AFewGoodMen:Movie {title:\"A Few Good Men\", released:1992, tagline:\"In the heart of the nation's capital, in a courthouse of the U.S. 
government, one man will stop at nothing to keep his honor, and one will stop at nothing to find the truth.\"})\n", + "CREATE (TomC:Person {name:'Tom Cruise', born:1962})\n", + "CREATE (JackN:Person {name:'Jack Nicholson', born:1937})\n", + "CREATE (DemiM:Person {name:'Demi Moore', born:1962})\n", + "CREATE (KevinB:Person {name:'Kevin Bacon', born:1958})\n", + "CREATE (KieferS:Person {name:'Kiefer Sutherland', born:1966})\n", + "CREATE (NoahW:Person {name:'Noah Wyle', born:1971})\n", + "CREATE (CubaG:Person {name:'Cuba Gooding Jr.', born:1968})\n", + "CREATE (KevinP:Person {name:'Kevin Pollak', born:1957})\n", + "CREATE (JTW:Person {name:'J.T. Walsh', born:1943})\n", + "CREATE (JamesM:Person {name:'James Marshall', born:1967})\n", + "CREATE (ChristopherG:Person {name:'Christopher Guest', born:1948})\n", + "CREATE (RobR:Person {name:'Rob Reiner', born:1947})\n", + "CREATE (AaronS:Person {name:'Aaron Sorkin', born:1961})\n", + "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Lt. Daniel Kaffee']}]->(AFewGoodMen),\n", + "(JackN)-[:ACTED_IN {roles:['Col. Nathan R. Jessup']}]->(AFewGoodMen),\n", + "(DemiM)-[:ACTED_IN {roles:['Lt. Cdr. JoAnne Galloway']}]->(AFewGoodMen),\n", + "(KevinB)-[:ACTED_IN {roles:['Capt. Jack Ross']}]->(AFewGoodMen),\n", + "(KieferS)-[:ACTED_IN {roles:['Lt. Jonathan Kendrick']}]->(AFewGoodMen),\n", + "(NoahW)-[:ACTED_IN {roles:['Cpl. Jeffrey Barnes']}]->(AFewGoodMen),\n", + "(CubaG)-[:ACTED_IN {roles:['Cpl. Carl Hammaker']}]->(AFewGoodMen),\n", + "(KevinP)-[:ACTED_IN {roles:['Lt. Sam Weinberg']}]->(AFewGoodMen),\n", + "(JTW)-[:ACTED_IN {roles:['Lt. Col. Matthew Andrew Markinson']}]->(AFewGoodMen),\n", + "(JamesM)-[:ACTED_IN {roles:['Pfc. Louden Downey']}]->(AFewGoodMen),\n", + "(ChristopherG)-[:ACTED_IN {roles:['Dr. Stone']}]->(AFewGoodMen),\n", + "(AaronS)-[:ACTED_IN {roles:['Man in Bar']}]->(AFewGoodMen),\n", + "(RobR)-[:DIRECTED]->(AFewGoodMen),\n", + "(AaronS)-[:WROTE]->(AFewGoodMen)\n", + "\n", + "CREATE (TopGun:Movie {title:\"Top Gun\", released:1986, tagline:'I feel the need, the need for speed.'})\n", + "CREATE (KellyM:Person {name:'Kelly McGillis', born:1957})\n", + "CREATE (ValK:Person {name:'Val Kilmer', born:1959})\n", + "CREATE (AnthonyE:Person {name:'Anthony Edwards', born:1962})\n", + "CREATE (TomS:Person {name:'Tom Skerritt', born:1933})\n", + "CREATE (MegR:Person {name:'Meg Ryan', born:1961})\n", + "CREATE (TonyS:Person {name:'Tony Scott', born:1944})\n", + "CREATE (JimC:Person {name:'Jim Cash', born:1941})\n", + "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Maverick']}]->(TopGun),\n", + "(KellyM)-[:ACTED_IN {roles:['Charlie']}]->(TopGun),\n", + "(ValK)-[:ACTED_IN {roles:['Iceman']}]->(TopGun),\n", + "(AnthonyE)-[:ACTED_IN {roles:['Goose']}]->(TopGun),\n", + "(TomS)-[:ACTED_IN {roles:['Viper']}]->(TopGun),\n", + "(MegR)-[:ACTED_IN {roles:['Carole']}]->(TopGun),\n", + "(TonyS)-[:DIRECTED]->(TopGun),\n", + "(JimC)-[:WROTE]->(TopGun)\n", + "\n", + "CREATE (JerryMaguire:Movie {title:'Jerry Maguire', released:2000, tagline:'The rest of his life begins now.'})\n", + "CREATE (ReneeZ:Person {name:'Renee Zellweger', born:1969})\n", + "CREATE (KellyP:Person {name:'Kelly Preston', born:1962})\n", + "CREATE (JerryO:Person {name:\"Jerry O'Connell\", born:1974})\n", + "CREATE (JayM:Person {name:'Jay Mohr', born:1970})\n", + "CREATE (BonnieH:Person {name:'Bonnie Hunt', born:1961})\n", + "CREATE (ReginaK:Person {name:'Regina King', born:1971})\n", + "CREATE (JonathanL:Person {name:'Jonathan Lipnicki', born:1996})\n", + "CREATE (CameronC:Person {name:'Cameron Crowe', born:1957})\n", 
+ "CREATE\n", + "(TomC)-[:ACTED_IN {roles:['Jerry Maguire']}]->(JerryMaguire),\n", + "(CubaG)-[:ACTED_IN {roles:['Rod Tidwell']}]->(JerryMaguire),\n", + "(ReneeZ)-[:ACTED_IN {roles:['Dorothy Boyd']}]->(JerryMaguire),\n", + "(KellyP)-[:ACTED_IN {roles:['Avery Bishop']}]->(JerryMaguire),\n", + "(JerryO)-[:ACTED_IN {roles:['Frank Cushman']}]->(JerryMaguire),\n", + "(JayM)-[:ACTED_IN {roles:['Bob Sugar']}]->(JerryMaguire),\n", + "(BonnieH)-[:ACTED_IN {roles:['Laurel Boyd']}]->(JerryMaguire),\n", + "(ReginaK)-[:ACTED_IN {roles:['Marcee Tidwell']}]->(JerryMaguire),\n", + "(JonathanL)-[:ACTED_IN {roles:['Ray Boyd']}]->(JerryMaguire),\n", + "(CameronC)-[:DIRECTED]->(JerryMaguire),\n", + "(CameronC)-[:PRODUCED]->(JerryMaguire),\n", + "(CameronC)-[:WROTE]->(JerryMaguire)\n", + "\n", + "CREATE (StandByMe:Movie {title:\"Stand By Me\", released:1986, tagline:\"For some, it's the last real taste of innocence, and the first real taste of life. But for everyone, it's the time that memories are made of.\"})\n", + "CREATE (RiverP:Person {name:'River Phoenix', born:1970})\n", + "CREATE (CoreyF:Person {name:'Corey Feldman', born:1971})\n", + "CREATE (WilW:Person {name:'Wil Wheaton', born:1972})\n", + "CREATE (JohnC:Person {name:'John Cusack', born:1966})\n", + "CREATE (MarshallB:Person {name:'Marshall Bell', born:1942})\n", + "CREATE\n", + "(WilW)-[:ACTED_IN {roles:['Gordie Lachance']}]->(StandByMe),\n", + "(RiverP)-[:ACTED_IN {roles:['Chris Chambers']}]->(StandByMe),\n", + "(JerryO)-[:ACTED_IN {roles:['Vern Tessio']}]->(StandByMe),\n", + "(CoreyF)-[:ACTED_IN {roles:['Teddy Duchamp']}]->(StandByMe),\n", + "(JohnC)-[:ACTED_IN {roles:['Denny Lachance']}]->(StandByMe),\n", + "(KieferS)-[:ACTED_IN {roles:['Ace Merrill']}]->(StandByMe),\n", + "(MarshallB)-[:ACTED_IN {roles:['Mr. Lachance']}]->(StandByMe),\n", + "(RobR)-[:DIRECTED]->(StandByMe)\n", + "\n", + "CREATE (AsGoodAsItGets:Movie {title:'As Good as It Gets', released:1997, tagline:'A comedy from the heart that goes for the throat.'})\n", + "CREATE (HelenH:Person {name:'Helen Hunt', born:1963})\n", + "CREATE (GregK:Person {name:'Greg Kinnear', born:1963})\n", + "CREATE (JamesB:Person {name:'James L. Brooks', born:1940})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Melvin Udall']}]->(AsGoodAsItGets),\n", + "(HelenH)-[:ACTED_IN {roles:['Carol Connelly']}]->(AsGoodAsItGets),\n", + "(GregK)-[:ACTED_IN {roles:['Simon Bishop']}]->(AsGoodAsItGets),\n", + "(CubaG)-[:ACTED_IN {roles:['Frank Sachs']}]->(AsGoodAsItGets),\n", + "(JamesB)-[:DIRECTED]->(AsGoodAsItGets)\n", + "\n", + "CREATE (WhatDreamsMayCome:Movie {title:'What Dreams May Come', released:1998, tagline:'After life there is more. 
The end is just the beginning.'})\n", + "CREATE (AnnabellaS:Person {name:'Annabella Sciorra', born:1960})\n", + "CREATE (MaxS:Person {name:'Max von Sydow', born:1929})\n", + "CREATE (WernerH:Person {name:'Werner Herzog', born:1942})\n", + "CREATE (Robin:Person {name:'Robin Williams', born:1951})\n", + "CREATE (VincentW:Person {name:'Vincent Ward', born:1956})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Chris Nielsen']}]->(WhatDreamsMayCome),\n", + "(CubaG)-[:ACTED_IN {roles:['Albert Lewis']}]->(WhatDreamsMayCome),\n", + "(AnnabellaS)-[:ACTED_IN {roles:['Annie Collins-Nielsen']}]->(WhatDreamsMayCome),\n", + "(MaxS)-[:ACTED_IN {roles:['The Tracker']}]->(WhatDreamsMayCome),\n", + "(WernerH)-[:ACTED_IN {roles:['The Face']}]->(WhatDreamsMayCome),\n", + "(VincentW)-[:DIRECTED]->(WhatDreamsMayCome)\n", + "\n", + "CREATE (SnowFallingonCedars:Movie {title:'Snow Falling on Cedars', released:1999, tagline:'First loves last. Forever.'})\n", + "CREATE (EthanH:Person {name:'Ethan Hawke', born:1970})\n", + "CREATE (RickY:Person {name:'Rick Yune', born:1971})\n", + "CREATE (JamesC:Person {name:'James Cromwell', born:1940})\n", + "CREATE (ScottH:Person {name:'Scott Hicks', born:1953})\n", + "CREATE\n", + "(EthanH)-[:ACTED_IN {roles:['Ishmael Chambers']}]->(SnowFallingonCedars),\n", + "(RickY)-[:ACTED_IN {roles:['Kazuo Miyamoto']}]->(SnowFallingonCedars),\n", + "(MaxS)-[:ACTED_IN {roles:['Nels Gudmundsson']}]->(SnowFallingonCedars),\n", + "(JamesC)-[:ACTED_IN {roles:['Judge Fielding']}]->(SnowFallingonCedars),\n", + "(ScottH)-[:DIRECTED]->(SnowFallingonCedars)\n", + "\n", + "CREATE (YouveGotMail:Movie {title:\"You've Got Mail\", released:1998, tagline:'At odds in life... in love on-line.'})\n", + "CREATE (ParkerP:Person {name:'Parker Posey', born:1968})\n", + "CREATE (DaveC:Person {name:'Dave Chappelle', born:1973})\n", + "CREATE (SteveZ:Person {name:'Steve Zahn', born:1967})\n", + "CREATE (TomH:Person {name:'Tom Hanks', born:1956})\n", + "CREATE (NoraE:Person {name:'Nora Ephron', born:1941})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Joe Fox']}]->(YouveGotMail),\n", + "(MegR)-[:ACTED_IN {roles:['Kathleen Kelly']}]->(YouveGotMail),\n", + "(GregK)-[:ACTED_IN {roles:['Frank Navasky']}]->(YouveGotMail),\n", + "(ParkerP)-[:ACTED_IN {roles:['Patricia Eden']}]->(YouveGotMail),\n", + "(DaveC)-[:ACTED_IN {roles:['Kevin Jackson']}]->(YouveGotMail),\n", + "(SteveZ)-[:ACTED_IN {roles:['George Pappas']}]->(YouveGotMail),\n", + "(NoraE)-[:DIRECTED]->(YouveGotMail)\n", + "\n", + "CREATE (SleeplessInSeattle:Movie {title:'Sleepless in Seattle', released:1993, tagline:'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?'})\n", + "CREATE (RitaW:Person {name:'Rita Wilson', born:1956})\n", + "CREATE (BillPull:Person {name:'Bill Pullman', born:1953})\n", + "CREATE (VictorG:Person {name:'Victor Garber', born:1949})\n", + "CREATE (RosieO:Person {name:\"Rosie O'Donnell\", born:1962})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Sam Baldwin']}]->(SleeplessInSeattle),\n", + "(MegR)-[:ACTED_IN {roles:['Annie Reed']}]->(SleeplessInSeattle),\n", + "(RitaW)-[:ACTED_IN {roles:['Suzy']}]->(SleeplessInSeattle),\n", + "(BillPull)-[:ACTED_IN {roles:['Walter']}]->(SleeplessInSeattle),\n", + "(VictorG)-[:ACTED_IN {roles:['Greg']}]->(SleeplessInSeattle),\n", + "(RosieO)-[:ACTED_IN {roles:['Becky']}]->(SleeplessInSeattle),\n", + "(NoraE)-[:DIRECTED]->(SleeplessInSeattle)\n", + "\n", + "CREATE (JoeVersustheVolcano:Movie {title:'Joe Versus the Volcano', released:1990, tagline:'A 
story of love, lava and burning desire.'})\n", + "CREATE (JohnS:Person {name:'John Patrick Stanley', born:1950})\n", + "CREATE (Nathan:Person {name:'Nathan Lane', born:1956})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Joe Banks']}]->(JoeVersustheVolcano),\n", + "(MegR)-[:ACTED_IN {roles:['DeDe', 'Angelica Graynamore', 'Patricia Graynamore']}]->(JoeVersustheVolcano),\n", + "(Nathan)-[:ACTED_IN {roles:['Baw']}]->(JoeVersustheVolcano),\n", + "(JohnS)-[:DIRECTED]->(JoeVersustheVolcano)\n", + "\n", + "CREATE (WhenHarryMetSally:Movie {title:'When Harry Met Sally', released:1998, tagline:'Can two friends sleep together and still love each other in the morning?'})\n", + "CREATE (BillyC:Person {name:'Billy Crystal', born:1948})\n", + "CREATE (CarrieF:Person {name:'Carrie Fisher', born:1956})\n", + "CREATE (BrunoK:Person {name:'Bruno Kirby', born:1949})\n", + "CREATE\n", + "(BillyC)-[:ACTED_IN {roles:['Harry Burns']}]->(WhenHarryMetSally),\n", + "(MegR)-[:ACTED_IN {roles:['Sally Albright']}]->(WhenHarryMetSally),\n", + "(CarrieF)-[:ACTED_IN {roles:['Marie']}]->(WhenHarryMetSally),\n", + "(BrunoK)-[:ACTED_IN {roles:['Jess']}]->(WhenHarryMetSally),\n", + "(RobR)-[:DIRECTED]->(WhenHarryMetSally),\n", + "(RobR)-[:PRODUCED]->(WhenHarryMetSally),\n", + "(NoraE)-[:PRODUCED]->(WhenHarryMetSally),\n", + "(NoraE)-[:WROTE]->(WhenHarryMetSally)\n", + "\n", + "CREATE (ThatThingYouDo:Movie {title:'That Thing You Do', released:1996, tagline:'In every life there comes a time when that thing you dream becomes that thing you do'})\n", + "CREATE (LivT:Person {name:'Liv Tyler', born:1977})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Mr. White']}]->(ThatThingYouDo),\n", + "(LivT)-[:ACTED_IN {roles:['Faye Dolan']}]->(ThatThingYouDo),\n", + "(Charlize)-[:ACTED_IN {roles:['Tina']}]->(ThatThingYouDo),\n", + "(TomH)-[:DIRECTED]->(ThatThingYouDo)\n", + "\n", + "CREATE (TheReplacements:Movie {title:'The Replacements', released:2000, tagline:'Pain heals, Chicks dig scars... 
Glory lasts forever'})\n", + "CREATE (Brooke:Person {name:'Brooke Langton', born:1970})\n", + "CREATE (Gene:Person {name:'Gene Hackman', born:1930})\n", + "CREATE (Orlando:Person {name:'Orlando Jones', born:1968})\n", + "CREATE (Howard:Person {name:'Howard Deutch', born:1950})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Shane Falco']}]->(TheReplacements),\n", + "(Brooke)-[:ACTED_IN {roles:['Annabelle Farrell']}]->(TheReplacements),\n", + "(Gene)-[:ACTED_IN {roles:['Jimmy McGinty']}]->(TheReplacements),\n", + "(Orlando)-[:ACTED_IN {roles:['Clifford Franklin']}]->(TheReplacements),\n", + "(Howard)-[:DIRECTED]->(TheReplacements)\n", + "\n", + "CREATE (RescueDawn:Movie {title:'RescueDawn', released:2006, tagline:\"Based on the extraordinary true story of one man's fight for freedom\"})\n", + "CREATE (ChristianB:Person {name:'Christian Bale', born:1974})\n", + "CREATE (ZachG:Person {name:'Zach Grenier', born:1954})\n", + "CREATE\n", + "(MarshallB)-[:ACTED_IN {roles:['Admiral']}]->(RescueDawn),\n", + "(ChristianB)-[:ACTED_IN {roles:['Dieter Dengler']}]->(RescueDawn),\n", + "(ZachG)-[:ACTED_IN {roles:['Squad Leader']}]->(RescueDawn),\n", + "(SteveZ)-[:ACTED_IN {roles:['Duane']}]->(RescueDawn),\n", + "(WernerH)-[:DIRECTED]->(RescueDawn)\n", + "\n", + "CREATE (TheBirdcage:Movie {title:'The Birdcage', released:1996, tagline:'Come as you are'})\n", + "CREATE (MikeN:Person {name:'Mike Nichols', born:1931})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Armand Goldman']}]->(TheBirdcage),\n", + "(Nathan)-[:ACTED_IN {roles:['Albert Goldman']}]->(TheBirdcage),\n", + "(Gene)-[:ACTED_IN {roles:['Sen. Kevin Keeley']}]->(TheBirdcage),\n", + "(MikeN)-[:DIRECTED]->(TheBirdcage)\n", + "\n", + "CREATE (Unforgiven:Movie {title:'Unforgiven', released:1992, tagline:\"It's a hell of a thing, killing a man\"})\n", + "CREATE (RichardH:Person {name:'Richard Harris', born:1930})\n", + "CREATE (ClintE:Person {name:'Clint Eastwood', born:1930})\n", + "CREATE\n", + "(RichardH)-[:ACTED_IN {roles:['English Bob']}]->(Unforgiven),\n", + "(ClintE)-[:ACTED_IN {roles:['Bill Munny']}]->(Unforgiven),\n", + "(Gene)-[:ACTED_IN {roles:['Little Bill Daggett']}]->(Unforgiven),\n", + "(ClintE)-[:DIRECTED]->(Unforgiven)\n", + "\n", + "CREATE (JohnnyMnemonic:Movie {title:'Johnny Mnemonic', released:1995, tagline:'The hottest data on earth. In the coolest head in town'})\n", + "CREATE (Takeshi:Person {name:'Takeshi Kitano', born:1947})\n", + "CREATE (Dina:Person {name:'Dina Meyer', born:1968})\n", + "CREATE (IceT:Person {name:'Ice-T', born:1958})\n", + "CREATE (RobertL:Person {name:'Robert Longo', born:1953})\n", + "CREATE\n", + "(Keanu)-[:ACTED_IN {roles:['Johnny Mnemonic']}]->(JohnnyMnemonic),\n", + "(Takeshi)-[:ACTED_IN {roles:['Takahashi']}]->(JohnnyMnemonic),\n", + "(Dina)-[:ACTED_IN {roles:['Jane']}]->(JohnnyMnemonic),\n", + "(IceT)-[:ACTED_IN {roles:['J-Bone']}]->(JohnnyMnemonic),\n", + "(RobertL)-[:DIRECTED]->(JohnnyMnemonic)\n", + "\n", + "CREATE (CloudAtlas:Movie {title:'Cloud Atlas', released:2012, tagline:'Everything is connected'})\n", + "CREATE (HalleB:Person {name:'Halle Berry', born:1966})\n", + "CREATE (JimB:Person {name:'Jim Broadbent', born:1949})\n", + "CREATE (TomT:Person {name:'Tom Tykwer', born:1965})\n", + "CREATE (DavidMitchell:Person {name:'David Mitchell', born:1969})\n", + "CREATE (StefanArndt:Person {name:'Stefan Arndt', born:1961})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Zachry', 'Dr. 
Henry Goose', 'Isaac Sachs', 'Dermot Hoggins']}]->(CloudAtlas),\n", + "(Hugo)-[:ACTED_IN {roles:['Bill Smoke', 'Haskell Moore', 'Tadeusz Kesselring', 'Nurse Noakes', 'Boardman Mephi', 'Old Georgie']}]->(CloudAtlas),\n", + "(HalleB)-[:ACTED_IN {roles:['Luisa Rey', 'Jocasta Ayrs', 'Ovid', 'Meronym']}]->(CloudAtlas),\n", + "(JimB)-[:ACTED_IN {roles:['Vyvyan Ayrs', 'Captain Molyneux', 'Timothy Cavendish']}]->(CloudAtlas),\n", + "(TomT)-[:DIRECTED]->(CloudAtlas),\n", + "(LillyW)-[:DIRECTED]->(CloudAtlas),\n", + "(LanaW)-[:DIRECTED]->(CloudAtlas),\n", + "(DavidMitchell)-[:WROTE]->(CloudAtlas),\n", + "(StefanArndt)-[:PRODUCED]->(CloudAtlas)\n", + "\n", + "CREATE (TheDaVinciCode:Movie {title:'The Da Vinci Code', released:2006, tagline:'Break The Codes'})\n", + "CREATE (IanM:Person {name:'Ian McKellen', born:1939})\n", + "CREATE (AudreyT:Person {name:'Audrey Tautou', born:1976})\n", + "CREATE (PaulB:Person {name:'Paul Bettany', born:1971})\n", + "CREATE (RonH:Person {name:'Ron Howard', born:1954})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Dr. Robert Langdon']}]->(TheDaVinciCode),\n", + "(IanM)-[:ACTED_IN {roles:['Sir Leight Teabing']}]->(TheDaVinciCode),\n", + "(AudreyT)-[:ACTED_IN {roles:['Sophie Neveu']}]->(TheDaVinciCode),\n", + "(PaulB)-[:ACTED_IN {roles:['Silas']}]->(TheDaVinciCode),\n", + "(RonH)-[:DIRECTED]->(TheDaVinciCode)\n", + "\n", + "CREATE (VforVendetta:Movie {title:'V for Vendetta', released:2006, tagline:'Freedom! Forever!'})\n", + "CREATE (NatalieP:Person {name:'Natalie Portman', born:1981})\n", + "CREATE (StephenR:Person {name:'Stephen Rea', born:1946})\n", + "CREATE (JohnH:Person {name:'John Hurt', born:1940})\n", + "CREATE (BenM:Person {name: 'Ben Miles', born:1967})\n", + "CREATE\n", + "(Hugo)-[:ACTED_IN {roles:['V']}]->(VforVendetta),\n", + "(NatalieP)-[:ACTED_IN {roles:['Evey Hammond']}]->(VforVendetta),\n", + "(StephenR)-[:ACTED_IN {roles:['Eric Finch']}]->(VforVendetta),\n", + "(JohnH)-[:ACTED_IN {roles:['High Chancellor Adam Sutler']}]->(VforVendetta),\n", + "(BenM)-[:ACTED_IN {roles:['Dascomb']}]->(VforVendetta),\n", + "(JamesM)-[:DIRECTED]->(VforVendetta),\n", + "(LillyW)-[:PRODUCED]->(VforVendetta),\n", + "(LanaW)-[:PRODUCED]->(VforVendetta),\n", + "(JoelS)-[:PRODUCED]->(VforVendetta),\n", + "(LillyW)-[:WROTE]->(VforVendetta),\n", + "(LanaW)-[:WROTE]->(VforVendetta)\n", + "\n", + "CREATE (SpeedRacer:Movie {title:'Speed Racer', released:2008, tagline:'Speed has no limits'})\n", + "CREATE (EmileH:Person {name:'Emile Hirsch', born:1985})\n", + "CREATE (JohnG:Person {name:'John Goodman', born:1960})\n", + "CREATE (SusanS:Person {name:'Susan Sarandon', born:1946})\n", + "CREATE (MatthewF:Person {name:'Matthew Fox', born:1966})\n", + "CREATE (ChristinaR:Person {name:'Christina Ricci', born:1980})\n", + "CREATE (Rain:Person {name:'Rain', born:1982})\n", + "CREATE\n", + "(EmileH)-[:ACTED_IN {roles:['Speed Racer']}]->(SpeedRacer),\n", + "(JohnG)-[:ACTED_IN {roles:['Pops']}]->(SpeedRacer),\n", + "(SusanS)-[:ACTED_IN {roles:['Mom']}]->(SpeedRacer),\n", + "(MatthewF)-[:ACTED_IN {roles:['Racer X']}]->(SpeedRacer),\n", + "(ChristinaR)-[:ACTED_IN {roles:['Trixie']}]->(SpeedRacer),\n", + "(Rain)-[:ACTED_IN {roles:['Taejo Togokahn']}]->(SpeedRacer),\n", + "(BenM)-[:ACTED_IN {roles:['Cass Jones']}]->(SpeedRacer),\n", + "(LillyW)-[:DIRECTED]->(SpeedRacer),\n", + "(LanaW)-[:DIRECTED]->(SpeedRacer),\n", + "(LillyW)-[:WROTE]->(SpeedRacer),\n", + "(LanaW)-[:WROTE]->(SpeedRacer),\n", + "(JoelS)-[:PRODUCED]->(SpeedRacer)\n", + "\n", + "CREATE (NinjaAssassin:Movie {title:'Ninja Assassin', 
released:2009, tagline:'Prepare to enter a secret world of assassins'})\n", + "CREATE (NaomieH:Person {name:'Naomie Harris'})\n", + "CREATE\n", + "(Rain)-[:ACTED_IN {roles:['Raizo']}]->(NinjaAssassin),\n", + "(NaomieH)-[:ACTED_IN {roles:['Mika Coretti']}]->(NinjaAssassin),\n", + "(RickY)-[:ACTED_IN {roles:['Takeshi']}]->(NinjaAssassin),\n", + "(BenM)-[:ACTED_IN {roles:['Ryan Maslow']}]->(NinjaAssassin),\n", + "(JamesM)-[:DIRECTED]->(NinjaAssassin),\n", + "(LillyW)-[:PRODUCED]->(NinjaAssassin),\n", + "(LanaW)-[:PRODUCED]->(NinjaAssassin),\n", + "(JoelS)-[:PRODUCED]->(NinjaAssassin)\n", + "\n", + "CREATE (TheGreenMile:Movie {title:'The Green Mile', released:1999, tagline:\"Walk a mile you'll never forget.\"})\n", + "CREATE (MichaelD:Person {name:'Michael Clarke Duncan', born:1957})\n", + "CREATE (DavidM:Person {name:'David Morse', born:1953})\n", + "CREATE (SamR:Person {name:'Sam Rockwell', born:1968})\n", + "CREATE (GaryS:Person {name:'Gary Sinise', born:1955})\n", + "CREATE (PatriciaC:Person {name:'Patricia Clarkson', born:1959})\n", + "CREATE (FrankD:Person {name:'Frank Darabont', born:1959})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Paul Edgecomb']}]->(TheGreenMile),\n", + "(MichaelD)-[:ACTED_IN {roles:['John Coffey']}]->(TheGreenMile),\n", + "(DavidM)-[:ACTED_IN {roles:['Brutus \"Brutal\" Howell']}]->(TheGreenMile),\n", + "(BonnieH)-[:ACTED_IN {roles:['Jan Edgecomb']}]->(TheGreenMile),\n", + "(JamesC)-[:ACTED_IN {roles:['Warden Hal Moores']}]->(TheGreenMile),\n", + "(SamR)-[:ACTED_IN {roles:['\"Wild Bill\" Wharton']}]->(TheGreenMile),\n", + "(GaryS)-[:ACTED_IN {roles:['Burt Hammersmith']}]->(TheGreenMile),\n", + "(PatriciaC)-[:ACTED_IN {roles:['Melinda Moores']}]->(TheGreenMile),\n", + "(FrankD)-[:DIRECTED]->(TheGreenMile)\n", + "\n", + "CREATE (FrostNixon:Movie {title:'Frost/Nixon', released:2008, tagline:'400 million people were waiting for the truth.'})\n", + "CREATE (FrankL:Person {name:'Frank Langella', born:1938})\n", + "CREATE (MichaelS:Person {name:'Michael Sheen', born:1969})\n", + "CREATE (OliverP:Person {name:'Oliver Platt', born:1960})\n", + "CREATE\n", + "(FrankL)-[:ACTED_IN {roles:['Richard Nixon']}]->(FrostNixon),\n", + "(MichaelS)-[:ACTED_IN {roles:['David Frost']}]->(FrostNixon),\n", + "(KevinB)-[:ACTED_IN {roles:['Jack Brennan']}]->(FrostNixon),\n", + "(OliverP)-[:ACTED_IN {roles:['Bob Zelnick']}]->(FrostNixon),\n", + "(SamR)-[:ACTED_IN {roles:['James Reston, Jr.']}]->(FrostNixon),\n", + "(RonH)-[:DIRECTED]->(FrostNixon)\n", + "\n", + "CREATE (Hoffa:Movie {title:'Hoffa', released:1992, tagline:\"He didn't want law. He wanted justice.\"})\n", + "CREATE (DannyD:Person {name:'Danny DeVito', born:1944})\n", + "CREATE (JohnR:Person {name:'John C. 
Reilly', born:1965})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Hoffa']}]->(Hoffa),\n", + "(DannyD)-[:ACTED_IN {roles:['Robert \"Bobby\" Ciaro']}]->(Hoffa),\n", + "(JTW)-[:ACTED_IN {roles:['Frank Fitzsimmons']}]->(Hoffa),\n", + "(JohnR)-[:ACTED_IN {roles:['Peter \"Pete\" Connelly']}]->(Hoffa),\n", + "(DannyD)-[:DIRECTED]->(Hoffa)\n", + "\n", + "CREATE (Apollo13:Movie {title:'Apollo 13', released:1995, tagline:'Houston, we have a problem.'})\n", + "CREATE (EdH:Person {name:'Ed Harris', born:1950})\n", + "CREATE (BillPax:Person {name:'Bill Paxton', born:1955})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Jim Lovell']}]->(Apollo13),\n", + "(KevinB)-[:ACTED_IN {roles:['Jack Swigert']}]->(Apollo13),\n", + "(EdH)-[:ACTED_IN {roles:['Gene Kranz']}]->(Apollo13),\n", + "(BillPax)-[:ACTED_IN {roles:['Fred Haise']}]->(Apollo13),\n", + "(GaryS)-[:ACTED_IN {roles:['Ken Mattingly']}]->(Apollo13),\n", + "(RonH)-[:DIRECTED]->(Apollo13)\n", + "\n", + "CREATE (Twister:Movie {title:'Twister', released:1996, tagline:\"Don't Breathe. Don't Look Back.\"})\n", + "CREATE (PhilipH:Person {name:'Philip Seymour Hoffman', born:1967})\n", + "CREATE (JanB:Person {name:'Jan de Bont', born:1943})\n", + "CREATE\n", + "(BillPax)-[:ACTED_IN {roles:['Bill Harding']}]->(Twister),\n", + "(HelenH)-[:ACTED_IN {roles:['Dr. Jo Harding']}]->(Twister),\n", + "(ZachG)-[:ACTED_IN {roles:['Eddie']}]->(Twister),\n", + "(PhilipH)-[:ACTED_IN {roles:['Dustin \"Dusty\" Davis']}]->(Twister),\n", + "(JanB)-[:DIRECTED]->(Twister)\n", + "\n", + "CREATE (CastAway:Movie {title:'Cast Away', released:2000, tagline:'At the edge of the world, his journey begins.'})\n", + "CREATE (RobertZ:Person {name:'Robert Zemeckis', born:1951})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Chuck Noland']}]->(CastAway),\n", + "(HelenH)-[:ACTED_IN {roles:['Kelly Frears']}]->(CastAway),\n", + "(RobertZ)-[:DIRECTED]->(CastAway)\n", + "\n", + "CREATE (OneFlewOvertheCuckoosNest:Movie {title:\"One Flew Over the Cuckoo's Nest\", released:1975, tagline:\"If he's crazy, what does that make you?\"})\n", + "CREATE (MilosF:Person {name:'Milos Forman', born:1932})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Randle McMurphy']}]->(OneFlewOvertheCuckoosNest),\n", + "(DannyD)-[:ACTED_IN {roles:['Martini']}]->(OneFlewOvertheCuckoosNest),\n", + "(MilosF)-[:DIRECTED]->(OneFlewOvertheCuckoosNest)\n", + "\n", + "CREATE (SomethingsGottaGive:Movie {title:\"Something's Gotta Give\", released:2003})\n", + "CREATE (DianeK:Person {name:'Diane Keaton', born:1946})\n", + "CREATE (NancyM:Person {name:'Nancy Meyers', born:1949})\n", + "CREATE\n", + "(JackN)-[:ACTED_IN {roles:['Harry Sanborn']}]->(SomethingsGottaGive),\n", + "(DianeK)-[:ACTED_IN {roles:['Erica Barry']}]->(SomethingsGottaGive),\n", + "(Keanu)-[:ACTED_IN {roles:['Julian Mercer']}]->(SomethingsGottaGive),\n", + "(NancyM)-[:DIRECTED]->(SomethingsGottaGive),\n", + "(NancyM)-[:PRODUCED]->(SomethingsGottaGive),\n", + "(NancyM)-[:WROTE]->(SomethingsGottaGive)\n", + "\n", + "CREATE (BicentennialMan:Movie {title:'Bicentennial Man', released:1999, tagline:\"One robot's 200 year journey to become an ordinary man.\"})\n", + "CREATE (ChrisC:Person {name:'Chris Columbus', born:1958})\n", + "CREATE\n", + "(Robin)-[:ACTED_IN {roles:['Andrew Marin']}]->(BicentennialMan),\n", + "(OliverP)-[:ACTED_IN {roles:['Rupert Burns']}]->(BicentennialMan),\n", + "(ChrisC)-[:DIRECTED]->(BicentennialMan)\n", + "\n", + "CREATE (CharlieWilsonsWar:Movie {title:\"Charlie Wilson's War\", released:2007, tagline:\"A stiff drink. A little mascara. 
A lot of nerve. Who said they couldn't bring down the Soviet empire.\"})\n", + "CREATE (JuliaR:Person {name:'Julia Roberts', born:1967})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Rep. Charlie Wilson']}]->(CharlieWilsonsWar),\n", + "(JuliaR)-[:ACTED_IN {roles:['Joanne Herring']}]->(CharlieWilsonsWar),\n", + "(PhilipH)-[:ACTED_IN {roles:['Gust Avrakotos']}]->(CharlieWilsonsWar),\n", + "(MikeN)-[:DIRECTED]->(CharlieWilsonsWar)\n", + "\n", + "CREATE (ThePolarExpress:Movie {title:'The Polar Express', released:2004, tagline:'This Holiday Season... Believe'})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Hero Boy', 'Father', 'Conductor', 'Hobo', 'Scrooge', 'Santa Claus']}]->(ThePolarExpress),\n", + "(RobertZ)-[:DIRECTED]->(ThePolarExpress)\n", + "\n", + "CREATE (ALeagueofTheirOwn:Movie {title:'A League of Their Own', released:1992, tagline:'Once in a lifetime you get a chance to do something different.'})\n", + "CREATE (Madonna:Person {name:'Madonna', born:1954})\n", + "CREATE (GeenaD:Person {name:'Geena Davis', born:1956})\n", + "CREATE (LoriP:Person {name:'Lori Petty', born:1963})\n", + "CREATE (PennyM:Person {name:'Penny Marshall', born:1943})\n", + "CREATE\n", + "(TomH)-[:ACTED_IN {roles:['Jimmy Dugan']}]->(ALeagueofTheirOwn),\n", + "(GeenaD)-[:ACTED_IN {roles:['Dottie Hinson']}]->(ALeagueofTheirOwn),\n", + "(LoriP)-[:ACTED_IN {roles:['Kit Keller']}]->(ALeagueofTheirOwn),\n", + "(RosieO)-[:ACTED_IN {roles:['Doris Murphy']}]->(ALeagueofTheirOwn),\n", + "(Madonna)-[:ACTED_IN {roles:['\"All the Way\" Mae Mordabito']}]->(ALeagueofTheirOwn),\n", + "(BillPax)-[:ACTED_IN {roles:['Bob Hinson']}]->(ALeagueofTheirOwn),\n", + "(PennyM)-[:DIRECTED]->(ALeagueofTheirOwn)\n", + "\n", + "CREATE (PaulBlythe:Person {name:'Paul Blythe'})\n", + "CREATE (AngelaScope:Person {name:'Angela Scope'})\n", + "CREATE (JessicaThompson:Person {name:'Jessica Thompson'})\n", + "CREATE (JamesThompson:Person {name:'James Thompson'})\n", + "\n", + "CREATE\n", + "(JamesThompson)-[:FOLLOWS]->(JessicaThompson),\n", + "(AngelaScope)-[:FOLLOWS]->(JessicaThompson),\n", + "(PaulBlythe)-[:FOLLOWS]->(AngelaScope)\n", + "\n", + "CREATE\n", + "(JessicaThompson)-[:REVIEWED {summary:'An amazing journey', rating:95}]->(CloudAtlas),\n", + "(JessicaThompson)-[:REVIEWED {summary:'Silly, but fun', rating:65}]->(TheReplacements),\n", + "(JamesThompson)-[:REVIEWED {summary:'The coolest football movie ever', rating:100}]->(TheReplacements),\n", + "(AngelaScope)-[:REVIEWED {summary:'Pretty funny at times', rating:62}]->(TheReplacements),\n", + "(JessicaThompson)-[:REVIEWED {summary:'Dark, but compelling', rating:85}]->(Unforgiven),\n", + "(JessicaThompson)-[:REVIEWED {summary:\"Slapstick redeemed only by the Robin Williams and Gene Hackman's stellar performances\", rating:45}]->(TheBirdcage),\n", + "(JessicaThompson)-[:REVIEWED {summary:'A solid romp', rating:68}]->(TheDaVinciCode),\n", + "(JamesThompson)-[:REVIEWED {summary:'Fun, but a little far fetched', rating:65}]->(TheDaVinciCode),\n", + "(JessicaThompson)-[:REVIEWED {summary:'You had me at Jerry', rating:92}]->(JerryMaguire)\n", + "\n", + "WITH TomH as a\n", + "MATCH (a)-[:ACTED_IN]->(m)<-[:DIRECTED]-(d) RETURN a,m,d LIMIT 10;" + ], + "metadata": { + "id": "QFbjo1k24YEY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can query the database via **cy2py** in this simple way" + ], + "metadata": { + "id": "peqcEHj0b35T" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "CALL apoc.meta.graph()" + ], + "metadata": { 
+ "id": "BfFOTNkncMqp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see the model is exactely how we expect!" + ], + "metadata": { + "id": "sGu-zpk8nY5r" + } + }, + { + "cell_type": "code", + "source": [ + "# this step is MANDATORY for the exercises\n", + "from neo4j import GraphDatabase\n", + "neo4j_driver = GraphDatabase.driver(neo4j_url, auth=(neo4j_user, neo4j_password))" + ], + "metadata": { + "id": "_zZF1guo58cc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c8bQe1b-7RY-" + }, + "source": [ + "# Read data from Neo4j into Spark\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The query above generates the following graph model:\n", + "\n" + ], + "metadata": { + "id": "ovoUnDmocaxK" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B1LLHYf1CsPh" + }, + "source": [ + "## Read nodes via `labels` option" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "omdSk6ShCqfA" + }, + "outputs": [], + "source": [ + "movies_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('labels', ':Movie')\n", + " .load())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RyglSgXnQcar" + }, + "source": [ + "### Schema description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f9AaUINjPH4n" + }, + "outputs": [], + "source": [ + "movies_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The `movies_df` contains a set of fields, the first two (generally) are always:\n", + "\n", + "* `` which represents the internal Neo4j id\n", + "* `` which represents the list of labels attached to the node\n", + "\n", + "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" + ], + "metadata": { + "id": "jxLcYSkgZ1xf" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "omGjaw5QDgS-" + }, + "outputs": [], + "source": [ + "movies_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7-KTRC5HD5sO" + }, + "source": [ + "### Exercise\n", + "\n", + "Read all the `Person` nodes store them into a Python variable called `person_df` and then verify the results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZhnsFC9KEsLp" + }, + "outputs": [], + "source": [ + "person_df = # write your spark code here" + ] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "person_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('labels', ':Person')\n", + " .load())\n", + "```\n", + "\n", + "
\n", + "\n" + ], + "metadata": { + "id": "O4WEzidAZBh-" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "D5_zXyweE-QM" + }, + "outputs": [], + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " created the person_df\n", + "\"\"\"\n", + "\n", + "assert person_df.count() == 133\n", + "assert person_df.schema.fieldNames() == ['', '', 'name', 'born']\n", + "assert person_df.collect()[0][\"\"] == ['Person']\n", + "print(\"All assertion are successfuly satisfied. Congrats you created your first DataFrame\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m1hgGMLCRoZx" + }, + "source": [ + "## Read relationships via `relationship` option" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HgPockV0I5Q3" + }, + "source": [ + "There are two way to transform relationships into DataFrame\n", + "\n", + "* having all the node and relationship data flattened into the DataFrame\n", + "* having all the node properties in maps and the relationship data as columns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m0DBqZLtKtvX" + }, + "source": [ + "### DataFrame with flattened data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "796cuMwXR2zi" + }, + "outputs": [], + "source": [ + "actedin_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('relationship', 'ACTED_IN')\n", + " .option('relationship.source.labels', ':Person')\n", + " .option('relationship.target.labels', ':Movie')\n", + " .load())" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Schema description" + ], + "metadata": { + "id": "yzyviI5vXO4K" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5uDWZqoySNGc" + }, + "outputs": [], + "source": [ + "actedin_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The `movies_df` contains a set of fields, the first two (generally) are always:\n", + "\n", + "* `` which represents the internal Neo4j relationship id\n", + "* `` which represents the relationship type\n", + "* `` which represents the internal Neo4j node id\n", + "* `` which represents the list of labels attached to the node\n", + "* `rel.*` which represents the properties attached to the relationship\n", + "* `source/target.*` which represents the properties attached to the node\n", + "\n", + "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" + ], + "metadata": { + "id": "2dB9DL7KZxrX" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VPHDTL-IUX2X" + }, + "outputs": [], + "source": [ + "actedin_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RoPVDptGKy_m" + }, + "source": [ + "### DataFrame with nodes as map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8VxgDlBXIt_h" + }, + "outputs": [], + "source": [ + "actedin_map_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('relationship.nodes.map', True)\n", + " .option('relationship', 'ACTED_IN')\n", + " .option('relationship.source.labels', ':Person')\n", + " .option('relationship.target.labels', ':Movie')\n", + " .load())" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Schema description" + ], + "metadata": { + "id": "nbGAYcd7YMOp" + } + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "id": "gF5-m4BbI2Ib" + }, + "outputs": [], + "source": [ + "actedin_map_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The `movies_df` contains a set of fields, the first two (generally) are always:\n", + "\n", + "* `` which represents the internal Neo4j relationship id\n", + "* `` which represents the relationship type\n", + "* `` which represents a map with node values\n", + "* `rel.*` which represents the properties attached to the relationship\n", + "\n", + "All other properties are taken from the node via schema resolution by using APOC or Cypher queries" + ], + "metadata": { + "id": "Zuu42SpfZ502" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UUYUwE3CLA_r" + }, + "outputs": [], + "source": [ + "actedin_map_df" + ] + }, + { + "cell_type": "code", + "source": [ + "actedin_map_df.collect()[0][\"\"]" + ], + "metadata": { + "id": "hHPq7neyYDSx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise\n", + "\n", + "Read all the `DIRECTED` relationships" + ], + "metadata": { + "id": "Viop-9_thCbF" + } + }, + { + "cell_type": "code", + "source": [ + "directed_df = # write your spark code here" + ], + "metadata": { + "id": "j0tTsk59hhLh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "directed_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('relationship', 'DIRECTED')\n", + " .option('relationship.source.labels', ':Person')\n", + " .option('relationship.target.labels', ':Movie')\n", + " .load())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "VQYyYSMpj2lf" + } + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " created the directed_df\n", + "\"\"\"\n", + "\n", + "assert directed_df.count() == 44\n", + "assert directed_df.schema.fieldNames() == ['',\n", + " '',\n", + " '',\n", + " '',\n", + " 'source.name',\n", + " 'source.born',\n", + " '',\n", + " '',\n", + " 'target.title',\n", + " 'target.tagline',\n", + " 'target.released']\n", + "assert directed_df.collect()[0][\"\"] == 'DIRECTED'\n", + "print(\"All assertion are successfuly satisfied. Congrats you created your first relationship DataFrame\")" + ], + "metadata": { + "id": "HRwaJ8PvhudP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Read arbitrary data via Cypher query" + ], + "metadata": { + "id": "hCpzW904dS2r" + } + }, + { + "cell_type": "code", + "source": [ + "cypher_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('query', '''\n", + " // Extend Tom Hanks co-actors, to find co-co-actors who haven't worked with Tom Hanks\n", + " MATCH (tom:Person {name:\"Tom Hanks\"})-[:ACTED_IN]->(m)<-[:ACTED_IN]-(coActors),\n", + " (coActors)-[:ACTED_IN]->(m2)<-[:ACTED_IN]-(cocoActors)\n", + " WHERE NOT (tom)-[:ACTED_IN]->()<-[:ACTED_IN]-(cocoActors)\n", + " AND tom <> cocoActors\n", + " RETURN cocoActors.name AS Recommended, count(*) AS Strength\n", + " ORDER BY Strength DESC\n", + " ''')\n", + " .load())" + ], + "metadata": { + "id": "hplBy0b_dhnb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Schema description" + ], + "metadata": { + "id": "tRZPA6xWeSCT" + } + }, + { + "cell_type": "code", + "source": [ + "cypher_df.printSchema()" + ], + "metadata": { + "id": "hU-JfgNNeL5f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "cypher_df" + ], + "metadata": { + "id": "8IcUQsileXQ7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// Just for debugging purposes let's check the same query directly from the database\n", + "MATCH (tom:Person {name:\"Tom Hanks\"})-[:ACTED_IN]->(m)<-[:ACTED_IN]-(coActors),\n", + " (coActors)-[:ACTED_IN]->(m2)<-[:ACTED_IN]-(cocoActors)\n", + "WHERE NOT (tom)-[:ACTED_IN]->()<-[:ACTED_IN]-(cocoActors)\n", + " AND tom <> cocoActors\n", + "RETURN cocoActors.name AS Recommended, count(*) AS Strength\n", + "ORDER BY Strength DESC\n", + "LIMIT 20" + ], + "metadata": { + "id": "CWtNAeoN4O6S" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise\n", + "\n", + "Return all the actors that have also directed a movie.\n", + "\n", + "The returned DataFrame must have 3 columns:\n", + "\n", + "* `name` the actor name\n", + "* `acted_in` a list of unique films (title) where he acted in\n", + "* `directed` a list of unique films (title) where he was a director" + ], + "metadata": { + "id": "Hyq4KsKQegdE" + } + }, + { + "cell_type": "code", + "source": [ + "your_cypher_df = # write your spark code here" + ], + "metadata": { + "id": "0h1FFuYxej2f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "your_cypher_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('query', '''\n", + " MATCH (p:Person)\n", + " MATCH (p)-[:ACTED_IN]->(m:Movie)\n", + " MATCH (p)-[:DIRECTED]->(m1:Movie)\n", + " RETURN p.name AS name, collect(m.title) AS acted_in, collect(m1.title) AS directed\n", + " ''')\n", + " .load())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "CQncCY52lCxv" + } + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " created the your_cypher_df\n", + "\"\"\"\n", + "\n", + "assert your_cypher_df.count() == 5\n", + "assert your_cypher_df.schema.fieldNames() == ['name', 'acted_in', 'directed']\n", + "your_cypher_df_collect = your_cypher_df.collect()\n", + "assert frozenset(map(lambda row: row['name'], your_cypher_df_collect)) == frozenset(['Clint Eastwood',\n", + " 'Danny DeVito',\n", + " 'James Marshall',\n", + " 'Werner Herzog',\n", + " 'Tom Hanks'])\n", + "assert frozenset(map(lambda row: frozenset(row['acted_in']), your_cypher_df_collect)) == set([\n", + " frozenset([\"Apollo 13\", \"You've Got Mail\", \"A League of Their Own\", \"Joe Versus the Volcano\", \"That Thing You Do\", \"The Da Vinci Code\", \"Cloud Atlas\", \"Cast Away\", \"The Green Mile\", \"Sleepless in Seattle\", \"The Polar Express\", \"Charlie Wilson's War\"]),\n", + " frozenset([\"What Dreams May Come\"]),\n", + " frozenset([\"Unforgiven\"]),\n", + " frozenset([\"A Few Good Men\"]),\n", + " frozenset([\"Hoffa\", \"One Flew Over the Cuckoo's Nest\"])\n", + " ])\n", + "assert frozenset(map(lambda row: frozenset(row['directed']), your_cypher_df_collect)) == set([\n", + " frozenset([\"That Thing You Do\"]),\n", + " frozenset([\"RescueDawn\"]),\n", + " frozenset([\"Unforgiven\"]),\n", + " frozenset([\"V for Vendetta\", \"Ninja Assassin\"]),\n", + " frozenset([\"Hoffa\"])\n", + " ])\n", + "print(\"All assertion are successfuly satisfied. Congrats you created your first cypher dataframe\")" + ], + "metadata": { + "id": "xG_7Wy-_go5V" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hFpA11aK8ADf" + }, + "source": [ + "# Write data from Spark to Neo4j" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## The graph model\n", + "\n", + "Our goal is to create this simple graph model\n", + "\n", + "" + ], + "metadata": { + "id": "Mx84Qi1PcHF_" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Trt-L_9pMQf1" + }, + "source": [ + "### Download The Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "h4o07NpuJmaG" + }, + "outputs": [], + "source": [ + "!wget -q https://s3.amazonaws.com/dev.assets.neo4j.com/wp-content/uploads/desktop-csv-import.zip" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The zip is composed of three files:\n", + "* products.csv: describes the products and has three columns (and no header)\n", + "* orders.csv: has three columns (with the header) and describe the order\n", + "* order-details.csv: is the \"join\" table between orders and products; it has three columns with header" + ], + "metadata": { + "id": "KKfl_ZyhYYWj" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nduIG7H_J0-A" + }, + "outputs": [], + "source": [ + "!unzip desktop-csv-import.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w5kaTPoEQNkT" + }, + "source": [ + "### Explore the Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L0TZgi_E1gAv" + }, + "outputs": [], + "source": [ + "products_df = (spark.read\n", + " .format('csv')\n", + " .option('inferSchema', True)\n", + " .option('path', '/content/desktop-csv-import/products.csv')\n", + " .load())" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "id": "0tptDEUn2WO6" + }, + "outputs": [], + "source": [ + "products_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSW40PxjKvgf" + }, + "outputs": [], + "source": [ + "products_df" + ] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see in the schema, colums have no name, just a generic `_c` prefix concatenated with an index.\n", + "The three columns describe:\n", + "* `_c0` is the `id` of the product\n", + "* `_c1` is the `name`\n", + "* `_c2` is the `price`\n", + "\n", + "Let's rename these columns!" + ], + "metadata": { + "id": "VrKERh4uXu8J" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7xabRYlXQZl4" + }, + "outputs": [], + "source": [ + "products_df = (products_df.withColumnRenamed('_c0', 'id')\n", + " .withColumnRenamed('_c1', 'name')\n", + " .withColumnRenamed('_c2', 'price'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "w08HTcIeQuNn" + }, + "outputs": [], + "source": [ + "products_df.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XCkyYIyzQv0X" + }, + "outputs": [], + "source": [ + "products_df" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Write nodes via `label` option" + ], + "metadata": { + "id": "XL9oBe0-m680" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Oy4eoeAMRWxc" + }, + "outputs": [], + "source": [ + "(products_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('append')\n", + " .option('labels', ':Product')\n", + " .save())" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Let's check if the nodes are in the database!" + ], + "metadata": { + "id": "MLxewMeFsQj3" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (n:Product)\n", + "RETURN n\n", + "LIMIT 10" + ], + "metadata": { + "id": "2yoBPvmmsUqt" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now just to be sure that we loaded all the nodes into Neo4j we'll count the dataframe and the nodes inside the database" + ], + "metadata": { + "id": "10PLvdmZ0tZT" + } + }, + { + "cell_type": "code", + "source": [ + "products_df.count()" + ], + "metadata": { + "id": "AmwadsK702Sl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (n:Product)\n", + "RETURN count(n)" + ], + "metadata": { + "id": "J8e9jZPG06DL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "If the two counts are equal, all the data has been properly imported." + ], + "metadata": { + "id": "T8XAaw3K1Mga" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Create Constraints\n", + "\n", + "Oh but wait, we forgot to create constraints!!! 
if we go into the Neo4j browser and excute the following query:\n", + "\n", + "```cypher\n", + "show constraints\n", + "```\n", + "\n", + "We should get the constraints of the movie database, but not one for `Product`.\n", + "\n", + "So please create the constaints for the node `Product`:\n", + "\n", + "```cypher\n", + "CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE;\n", + "```\n", + "\n", + "But if you want, you can also delegate the Spark connector to perform optimizations pre-processing by usign the option `schema.optimization.type` which can assume three values:\n", + "\n", + "* `INDEX`: it creates only indexes on provided nodes.\n", + "* `NODE_CONSTRAINTS`: it creates only indexes on provided nodes.\n", + "\n", + "So let's create the `Order` node with by let the connector creating the constraints for you" + ], + "metadata": { + "id": "jFL_tQk0tsni" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// Check the constraints\n", + "SHOW CONSTRAINTS" + ], + "metadata": { + "id": "T3PUsfIvsi23" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// Create the constraint for Product node\n", + "CREATE CONSTRAINT product_id IF NOT EXISTS FOR (p:Product) REQUIRE p.id IS UNIQUE;" + ], + "metadata": { + "id": "iMF68OU20XhE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// Check (again) the constraints\n", + "SHOW CONSTRAINTS" + ], + "metadata": { + "id": "w_wSMYvz0fz_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see now we have the `product_id` constraint in the result list" + ], + "metadata": { + "id": "2JwYLMva1VnN" + } + }, + { + "cell_type": "code", + "source": [ + "orders_df = (spark.read\n", + " .format('csv')\n", + " .option('inferSchema', True)\n", + " .option('header', True)\n", + " .option('path', '/content/desktop-csv-import/orders.csv')\n", + " .load())" + ], + "metadata": { + "id": "m5ge2R_Ggd3K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "orders_df.printSchema()" + ], + "metadata": { + "id": "7UGwHtuJwFU4" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "orders_df" + ], + "metadata": { + "id": "uGmUP5ZMwJkm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# we cast orderDate to timestamp in order to have it converted properly into Neo4j\n", + "orders_df = orders_df.selectExpr('orderID AS id', 'CAST(orderDate AS TIMESTAMP) AS date', 'shipCountry')" + ], + "metadata": { + "id": "InwYglcUwXNy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "orders_df.printSchema()" + ], + "metadata": { + "id": "kSJmW81cw_ES" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "orders_df" + ], + "metadata": { + "id": "7VN5RAizxSr2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "(orders_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('labels', ':Order')\n", + " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", + " # this is necessary in order to specify what is the constraint field\n", + " .option('node.keys', 'id')\n", + " .save())" + ], + "metadata": { + "id": "0rwKB9V-xTzu" + }, + "execution_count": null, + "outputs": [] 
+ }, + { + "cell_type": "markdown", + "source": [ + "Now let's check if the connector has created the constraint for us" + ], + "metadata": { + "id": "A_T2beEJx3Ho" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "SHOW CONSTRAINTS" + ], + "metadata": { + "id": "hDxdKyFY1sDT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see the we have the constraint `spark_NODE_CONSTRAINTS_Order_id` that has been create by the Spark connector itself.\n", + "\n", + "Now just because we're courious let's check if the data has been propertly loaded.\n", + "\n", + "The first thing to check is if the count of the Dataframe and the nodes in Neo4j matches." + ], + "metadata": { + "id": "H5Bne-Mq1vsO" + } + }, + { + "cell_type": "code", + "source": [ + "orders_df.count()" + ], + "metadata": { + "id": "mANrH-Zt2ShO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (o:Order)\n", + "RETURN count(o)" + ], + "metadata": { + "id": "vzCCVYAK2V0X" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we want to check if the data has been loaded with the proper data type, in particular we created a new column `date` by casting `orderDate` to `TIMESTAMP`." + ], + "metadata": { + "id": "X8BBxCQg2dFM" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (o:Order)\n", + "RETURN apoc.meta.cypher.type(o.date), count(o)" + ], + "metadata": { + "id": "VN7XMI192xAP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "So all the `date` values have the same type." + ], + "metadata": { + "id": "PpXyKtJF3Lk4" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise\n", + "\n", + "Given the `football_teams_df` and `football_player_df` below please:\n", + "* for `football_teams_df` insert it as nodes with label `:FootballTeam` in Neo4j.\n", + "* for `football_player_df` insert it as nodes with label `:FootballPlayer` in Neo4j.\n", + "\n", + "Create for both of them constraints via the schema optimization feature:\n", + "* for `football_teams_df` the key must be the property `id`\n", + "* for `football_player_df` the key must be the property `name`" + ], + "metadata": { + "id": "Zi4Dl2LqmVmN" + } + }, + { + "cell_type": "code", + "source": [ + "football_teams_df = spark.createDataFrame([{'id': 1, 'name': 'AC Milan'}, {'id': 2, 'name': 'FC Internazionale'}])\n", + "football_player_df = spark.createDataFrame([\n", + " {'name': 'Zlatan Ibrahimovic'},\n", + " {'name': 'Sandro Tonali'},\n", + " {'name': 'Nicolò Barella'},\n", + " {'name': 'Marcelo Brozovic'}])" + ], + "metadata": { + "id": "21tFDtgAmVON" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# write your spark code that persist football_teams_df and football_player_df here" + ], + "metadata": { + "id": "utpvz-fI6blD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "# write the teams\n", + "(football_teams_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('labels', ':FootballTeam')\n", + " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", + " .option('node.keys', 'id')\n", + " .save())\n", + "# write the players\n", + "(football_player_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('labels', ':FootballPlayer')\n", + " .option('schema.optimization.type', 'NODE_CONSTRAINTS')\n", + " .option('node.keys', 'name')\n", + " .save())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "xeVxPe7PmEy8" + } + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " persisted football_teams_df and\n", + " football_player_df in Neo4j as nodes\n", + "\"\"\"\n", + "\n", + "with neo4j_driver.session() as session:\n", + " # count football players\n", + " football_players = session.read_transaction(lambda tx: (tx.run('''\n", + " MATCH (p:FootballPlayer)\n", + " WHERE p.name IN ['Zlatan Ibrahimovic', 'Sandro Tonali',\n", + " 'Nicolò Barella', 'Marcelo Brozovic']\n", + " RETURN count(p) AS count\n", + " ''').single()['count']))\n", + " assert football_players == 4\n", + "\n", + " # count football teams\n", + " football_teams = session.read_transaction(lambda tx: (tx.run('''\n", + " MATCH (p:FootballTeam)\n", + " WHERE p.name IN ['AC Milan', 'FC Internazionale']\n", + " RETURN count(p) AS count\n", + " ''').single()['count']))\n", + " assert football_teams == 2\n", + "\n", + " # count constraints\n", + " football_constraints = session.read_transaction(lambda tx: (tx.run('''\n", + " SHOW CONSTRAINTS YIELD name\n", + " WHERE name IN ['spark_NODE_CONSTRAINTS_FootballPlayer_name', 'spark_NODE_CONSTRAINTS_FootballTeam_id']\n", + " RETURN count(*) AS count\n", + " ''').single()['count']))\n", + " assert football_constraints == 2\n", + "\n", + "print(\"All assertion are successfuly satisfied. Congrats you saved your first Node DataFrame into Neo4j!\")" + ], + "metadata": { + "id": "a5zQCEyK6h5f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Write relationships via `relationship` option" + ], + "metadata": { + "id": "QltYkhIuy2Kc" + } + }, + { + "cell_type": "code", + "source": [ + "order_details_df = (spark.read\n", + " .format('csv')\n", + " .option('inferSchema', True)\n", + " .option('header', True)\n", + " .option('path', '/content/desktop-csv-import/order-details.csv')\n", + " .load())" + ], + "metadata": { + "id": "y3U_X0b7x2UN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "order_details_df.printSchema()" + ], + "metadata": { + "id": "zFqKW-j-zRyk" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "order_details_df" + ], + "metadata": { + "id": "5s5f3W984Jc0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Please remember that this is the pattern that we want to ingest:\n", + "\n", + "\n", + "" + ], + "metadata": { + "id": "aZWSOtNK2qEw" + } + }, + { + "cell_type": "code", + "source": [ + "(order_details_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('relationship', 'CONTAINS')\n", + " .option('relationship.save.strategy', 'keys')\n", + " .option('relationship.source.labels', ':Product')\n", + " .option('relationship.source.save.mode', 'Match')\n", + " .option('relationship.source.node.keys', 'productID:id')\n", + " .option('relationship.target.labels', ':Order')\n", + " .option('relationship.target.save.mode', 'Match')\n", + " .option('relationship.target.node.keys', 'orderID:id')\n", + " .option('relationship.properties', 'quantity:quantityOrdered')\n", + " .save())" + ], + "metadata": { + "id": "rFo3KWA90rmZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's check the count for both Dataframe and relationships in Neo4j" + ], + 
"metadata": { + "id": "1PaC36iZ3bNf" + } + }, + { + "cell_type": "code", + "source": [ + "order_details_df.count()" + ], + "metadata": { + "id": "OUZ5FYHP3qvj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (p:Product)-[r:CONTAINS]->(o:Order)\n", + "RETURN count(r)" + ], + "metadata": { + "id": "ex8o2pSo34cI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise\n", + "\n", + "Given the `team_player_df` create a relationship between `:FootballPlayer` and `:FootballTeam` of type `PLAYS_FOR`:\n", + "\n", + "```cypher\n", + "(:FootballPlayer)-[:PLAYS_FOR]->(:FootballTeam)\n", + "```" + ], + "metadata": { + "id": "lTBo369687lC" + } + }, + { + "cell_type": "code", + "source": [ + "team_player_df = spark.createDataFrame([\n", + " {'id': 1, 'football_player': 'Zlatan Ibrahimovic'},\n", + " {'id': 1, 'football_player': 'Sandro Tonali'},\n", + " {'id': 2, 'football_player': 'Nicolò Barella'},\n", + " {'id': 2, 'football_player': 'Marcelo Brozovic'}])" + ], + "metadata": { + "id": "gi9kB0l49f8H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# write your spark code that persist team_player_df here" + ], + "metadata": { + "id": "i7ZjASDx_Kiy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "(team_player_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('relationship', 'PLAYS_FOR')\n", + " .option('relationship.save.strategy', 'keys')\n", + " .option('relationship.source.labels', ':FootballPlayer')\n", + " .option('relationship.source.save.mode', 'Match')\n", + " .option('relationship.source.node.keys', 'football_player:name')\n", + " .option('relationship.target.labels', ':FootballTeam')\n", + " .option('relationship.target.save.mode', 'Match')\n", + " .option('relationship.target.node.keys', 'id')\n", + " .save())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "oNRJevUSm0Wi" + } + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " persisted team_player_df as relationships\n", + "\"\"\"\n", + "\n", + "with neo4j_driver.session() as session:\n", + " # count relationships\n", + " def count_relationships(tx):\n", + " result = tx.run('''\n", + " MATCH (p:FootballPlayer)-[:PLAYS_FOR]->(t:FootballTeam)\n", + " RETURN t.name AS team, collect(p.name) AS players\n", + " ORDER by team\n", + " ''')\n", + " return [{'team': record['team'], 'players': set(record['players'])} for record in result]\n", + "\n", + " actual = session.read_transaction(count_relationships)\n", + " expected = [\n", + " {'team': 'AC Milan', 'players': frozenset(['Zlatan Ibrahimovic', 'Sandro Tonali'])},\n", + " {'team': 'FC Internazionale', 'players': frozenset(['Nicolò Barella', 'Marcelo Brozovic'])}\n", + " ]\n", + " assert actual == expected\n", + "\n", + "print(\"All assertion are successfuly satisfied. Congrats you saved your first Relationship DataFrame into Neo4j!\")" + ], + "metadata": { + "id": "LDYbmoUx_Owb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Write custom graphs via Cypher Query\n", + "\n", + "Now let's consider that two actors created an order and bought several products, and we want to add information in our database." + ], + "metadata": { + "id": "vBq9NCWlZkHw" + } + }, + { + "cell_type": "code", + "source": [ + "actor_orders = [\n", + " {'actor_name': 'Cuba Gooding Jr.', 'order_id': 1, 'products': [11, 42, 72], 'quantities': [1, 2, 3], 'order_date': '2022-06-07 00:00:00'},\n", + " {'actor_name': 'Tom Hanks', 'order_id': 2, 'products': [24, 55, 75], 'quantities': [3, 2, 1], 'order_date': '2022-06-06 00:00:00'}\n", + "]\n", + "\n", + "actor_orders_df = spark.createDataFrame(actor_orders)" + ], + "metadata": { + "id": "3y_yEOouaHxe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "actor_orders_df.printSchema()" + ], + "metadata": { + "id": "hYo7nlgvbfdm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "actor_orders_df" + ], + "metadata": { + "id": "fOkW4w2lbhZP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "In this case please go into Neo4j and create the following constraint:\n", + "\n", + "```cypher\n", + "CREATE CONSTRAINT person_name FOR (p:Person) REQUIRE p.name is UNIQUE;\n", + "```" + ], + "metadata": { + "id": "q0wAIB5l7qp9" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// if you didn't before create the constraint on Person.name\n", + "CREATE CONSTRAINT person_name IF NOT EXISTS FOR (p:Person) REQUIRE p.name is UNIQUE;" + ], + "metadata": { + "id": "-_Lxm_zV4qw2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "(actor_orders_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('query', '''\n", + " MATCH (person:Person {name: event.actor_name})\n", + " MERGE (order:Order {id: event.order_id, date: datetime(replace(event.order_date, ' ', 'T'))})\n", + " MERGE (person)-[:CREATED]->(order)\n", + " WITH event, order\n", + " UNWIND range(0, size(event.products) - 1) AS index\n", + " MATCH (product:Product {id: event.products[index]})\n", + " MERGE (product)-[:CONTAINS{quantityOrdered: 
event.quantities[index]}]->(order)\n", + " ''')\n", + " .save())" + ], + "metadata": { + "id": "cm3RQyLbbjue" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "What we expect now is that for the two actors there are two orders one per each, then each order contains three products." + ], + "metadata": { + "id": "61LUSN4F6pQq" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (a:Person)-[:CREATED]->(o:Order)<-[c:CONTAINS]-(p:Product)\n", + "WHERE a.name IN ['Cuba Gooding Jr.', 'Tom Hanks']\n", + "RETURN a.name, o.id, o.date, p.name, c.quantityOrdered" + ], + "metadata": { + "id": "8fVMQiTf61mN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise\n", + "\n", + "Given `neo4j_resources_df` build a small Knowledge Graph in Neo4j with the following structure:\n", + "\n", + "```cypher\n", + "(:Author{name})-[:CREATED]->(:Resource{name})-[:HAS_TAG]->(:Tag{name})\n", + "```" + ], + "metadata": { + "id": "pn7IM8me9R3I" + } + }, + { + "cell_type": "code", + "source": [ + "neo4j_resources_df = spark.createDataFrame([\n", + " {'author': 'LARUS Business Automation', 'resource': 'Galileo.XAI', 'tags': ['Graph Machine Learning', 'Neo4j', 'Explainable AI', 'Artificial Intelligence']},\n", + " {'author': 'Neo4j', 'resource': 'Graph Data Science Library', 'tags': ['Graph Machine Learning', 'Algorithms']},\n", + " {'author': 'Michael Hunger', 'resource': 'APOC', 'tags': ['Graph Data Integration', 'Graph Algorithms']}\n", + "])" + ], + "metadata": { + "id": "_wmoLl8d9RVz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "neo4j_resources_df" + ], + "metadata": { + "id": "NkG1jCynLXgJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# write your spark code that persist neo4j_resources_df here" + ], + "metadata": { + "id": "8EQmY-qhsbi1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "(neo4j_resources_df.write\n", + " .format('org.neo4j.spark.DataSource')\n", + " .mode('overwrite')\n", + " .option('query', '''\n", + " MERGE (a:Author {name: event.author})\n", + " MERGE (r:Resource {name: event.resource})\n", + " MERGE (a)-[:CREATED]->(r)\n", + " WITH a, r, event\n", + " UNWIND event.tags AS tag\n", + " MERGE (t:Tag{name: tag})\n", + " MERGE (r)-[:HAS_TAG]->(t)\n", + " ''')\n", + " .save())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "KCfw_saanywo" + } + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This paragraph is for validating the code the you\n", + " wrote above, please execute it after you\n", + " persisted neo4j_resources_df as Cypher query\n", + "\"\"\"\n", + "\n", + "with neo4j_driver.session() as session:\n", + " # count relationships\n", + " def check_graph_consistency(tx):\n", + " result = tx.run('''\n", + " MATCH (a:Author)-[:CREATED]->(r:Resource)-[:HAS_TAG]->(t:Tag)\n", + " RETURN a.name AS author, r.name AS resource, collect(t.name) AS tags\n", + " ORDER By author\n", + " ''')\n", + " return [{'author': record['author'], 'resource': record['resource'], 'tags': set(record['tags'])} for record in result]\n", + "\n", + " actual = session.read_transaction(check_graph_consistency)\n", + " expected = [\n", + " {'author': 'LARUS Business Automation', 'resource': 'Galileo.XAI', 'tags': frozenset(['Graph Machine Learning', 'Neo4j', 'Explainable AI', 'Artificial Intelligence'])},\n", + " {'author': 'Michael Hunger', 'resource': 'APOC', 'tags': frozenset(['Graph Data Integration', 'Graph Algorithms'])},\n", + " {'author': 'Neo4j', 'resource': 'Graph Data Science Library', 'tags': frozenset(['Graph Machine Learning', 'Algorithms'])}\n", + " ]\n", + " assert actual == expected\n", + "\n", + "print(\"All assertion are successfuly satisfied. Congrats you saved your first Knowledge Graph DataFrame into Neo4j!\")" + ], + "metadata": { + "id": "LwqbSsEcsgDi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "_LbqufNZMj6-" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/neo4j_data_science.ipynb b/examples/neo4j_data_science.ipynb index 673cbdee4..391485aa1 100644 --- a/examples/neo4j_data_science.ipynb +++ b/examples/neo4j_data_science.ipynb @@ -1,1994 +1,2001 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "Open this notebook in Google Colab \n", - " \"Open\n", - "" - ], - "metadata": { - "id": "ciNaixnkx1vj" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Example of a Simple data science workflow with Neo4j and Spark" - ], - "metadata": { - "id": "zADiJjnuVfq2" - } - }, - { - "cell_type": "markdown", - "source": [ - "This notebook contains a set of examples that explains how the Neo4j Spark connector can fit in you Data Scinece workflow, how you can combine Spark Neo4j and the Graph Data Science library to extract insights from your data and mostly important it allows you to test your knowledge with a set of exercises after each section.\n", - "\n", - "If you have any questions or problems feel free to write a post in the [Neo4j community forum](https://community.neo4j.com/) or in [Discord](https://discord.com/invite/neo4j).\n", - "\n", - "If you want more exercises feel free to open an issue in the [GitHub repository](https://github.com/neo4j/neo4j-spark-connector).\n", - "\n", - "Enjoy!" 
- ], - "metadata": { - "id": "nLucMn17V0YK" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Notes about this notebook\n", - "\n", - "This code contains a simple data science workflow that combines Neo4j's Graph Data Science Library with the Neo4j Connector for Apache Spark.\n", - "\n", - "Going forward you'll find code examples in:\n", - "\n", - "* PySpark\n", - "* PySpark Pandas\n", - "\n", - "You can choose to navigate by using one of them, or both, but we suggest you do one at time to ensure you understand the APIs." - ], - "metadata": { - "id": "pWWY8190RB98" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Create the sandbox instance\n", - "\n", - "You can easily spin-up a Neo4j sandbox by click [here](https://sandbox.neo4j.com/?usecase=fraud-detection)\n", - "\n", - "After that you'll be redirect in a webpage like this:\n", - "\n", - "\n", - "\n", - "Please click in the **Connection details tab** and copy your connection parameters into the Python variables below" - ], - "metadata": { - "id": "3hCQBmBKVaHm" - } - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ttxf62TPVP-w" - }, - "outputs": [], - "source": [ - "neo4j_url = \"\" # put your neo4j url here" - ] - }, - { - "cell_type": "code", - "source": [ - "neo4j_user = \"neo4j\" # put your neo4j user here" - ], - "metadata": { - "id": "-lPr1hfIGtfL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "neo4j_password = \"\" # put your neo4j password here" - ], - "metadata": { - "id": "yoI29jjvGvlX" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Configure the Spark Environment" - ], - "metadata": { - "id": "Capd99x5G2rm" - } - }, - { - "cell_type": "code", - "source": [ - "spark_version = '3.3.4'" - ], - "metadata": { - "id": "OiHMiko1-Qf7" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!apt-get install openjdk-17-jdk-headless -qq > /dev/null" - ], - "metadata": { - "id": "qdjzLBDzGx5l" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!wget -q https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz" - ], - "metadata": { - "id": "7JT9OKhzG7Lq" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "A3gsnSHl0F99" - }, - "outputs": [], - "source": [ - "!tar xf spark-$spark_version-bin-hadoop3.tgz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hSBQWKs90vSx" - }, - "outputs": [], - "source": [ - "!pip install -q findspark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tnW0a1Gj080k" - }, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-amd64\"\n", - "os.environ[\"SPARK_HOME\"] = f\"/content/spark-{spark_version}-bin-hadoop3\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dlUBSezK1DpZ" - }, - "outputs": [], - "source": [ - "import findspark\n", - "findspark.init()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dOUJ-W871Tur" - }, - "outputs": [], - "source": [ - "from pyspark.sql import SparkSession\n", - "spark = (SparkSession.builder\n", - " .master('local[*]')\n", - " .appName('Data science workflow with Neo4j and Spark')\n", - " .config('spark.ui.port', 
'4050')\n", - " # Just to show dataframes as tables\n", - " #.config('spark.sql.repl.eagerEval.enabled', False)\n", - " .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.12:5.1.0_for_spark_3')\n", - " # As we're using always the same database instance we'll\n", - " # define them as global variables\n", - " # so we don't need to repeat them each time\n", - " .config(\"neo4j.url\", neo4j_url)\n", - " .config(\"neo4j.authentication.type\", \"basic\")\n", - " .config(\"neo4j.authentication.basic.username\", neo4j_user)\n", - " .config(\"neo4j.authentication.basic.password\", neo4j_password)\n", - " .getOrCreate())\n", - "spark" - ] - }, - { - "cell_type": "code", - "source": [ - "# import utility functions that we'll use in the notebook\n", - "from pyspark.sql.types import *\n", - "from pyspark.sql.functions import *" - ], - "metadata": { - "id": "pghCcGnJWcZQ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Import PySpark Pandas\n", - "\n", - "Pandas API on Apache Spark (PySpark) enables data scientists and data engineers to run their existing Pandas code on Spark. Prior to this API, you had to do a significant code rewrite from Pandas DataFrame to PySpark DataFrame which is time-consuming and error-prone.\n", - "\n", - "In this notebook we'll use both PySpark Dataframes and and PySpark Pandas.\n", - "\n", - "The only thing that we need to do is to import the library using the statement below." - ], - "metadata": { - "id": "klQ2Ah6CFBV1" - } - }, - { - "cell_type": "code", - "source": [ - "import pyspark.pandas as ps" - ], - "metadata": { - "id": "lDkBcHySCBT0" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "## Exercises prerequisite\n", - "\n", - "In this notebook we and going to test your knowledge. 
Some of the exercises require the Neo4j Python driver to check if the exercises are being solved correctly.\n", - "\n", - "*Neo4j Python Driver is required only for verifying the exercises when you persist data from Spark to Neo4j*\n", - "\n", - "**It's not required by the Spark connector!!!**\n", - "\n", - "We'll use [Cy2Py](https://github.com/conker84/cy2py), a Jupyter extension that easily allows you to connect to Neo4j and visualize data from Jupyter notebooks.\n", - "For a detailed instruction about how to use it please dive into [this example](https://github.com/conker84/cy2py/blob/main/examples/Neo4j_Crime_Investigation_Dataset.ipynb)" - ], - "metadata": { - "id": "b6_YNZnZ5GdT" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install -q cy2py" - ], - "metadata": { - "id": "f5ZZJylo5Bbz" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "let's load the extension" - ], - "metadata": { - "id": "uKYEPEgOcG2b" - } - }, - { - "cell_type": "code", - "source": [ - "%load_ext cy2py" - ], - "metadata": { - "id": "38EeXF6icKOK" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "You can query the database via **cy2py** in this simple way" - ], - "metadata": { - "id": "peqcEHj0b35T" - } - }, - { - "cell_type": "code", - "source": [ - "# define the colors for the nodes\n", - "colors = {\n", - " ':Client': '#D18711',\n", - " ':Bank': '#0541B2',\n", - " ':Merchant': '#9E14AA',\n", - " ':Mule': '#6113A3',\n", - " ':CashIn': '#328918',\n", - " ':CashOut': '#C1A23D',\n", - " ':Debit': '#A32727',\n", - " ':Payment': '#3B80C4',\n", - " ':Transfer': '#088472',\n", - " ':Transaction': '#D10B4F',\n", - " ':Email': '#EA5D1E',\n", - " ':SSN': '#707070',\n", - " ':Phone': '#4B4444',\n", - "}" - ], - "metadata": { - "id": "dw2P-XpfLCJY" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher -u $neo4j_url -us $neo4j_user -pw $neo4j_password -co $colors\n", - "CALL apoc.meta.graph()" - ], - "metadata": { - "id": "BfFOTNkncMqp" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Problem Definition\n" - ], - "metadata": { - "id": "d-x29ClTPBnv" - } - }, - { - "cell_type": "markdown", - "source": [ - "## What is Fraud?\n", - "Fraud occurs when an individual or group of individuals, or a business entity intentionally deceives another individual or business entity with misrepresentation of identity, products, services, or financial transactions and/or false promises with no intention of fulfilling them." - ], - "metadata": { - "id": "79q5QJfcPMa6" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Fraud Categories\n" - ], - "metadata": { - "id": "naUmXhC-PQGR" - } - }, - { - "cell_type": "markdown", - "source": [ - "### First-party Fraud\n", - "An individual, or group of individuals, misrepresent their identity or give false information when applying for a product or services to receive more favourable rates or when have no intention of repayment." - ], - "metadata": { - "id": "edTfWFSAPUKF" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Second-party Fraud\n", - "An individual knowingly gives their identity or personal information to another individual to commit fraud or someone is perpetrating fraud in his behalf." 
- ], - "metadata": { - "id": "Zr9sGs_9PYmH" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Third-party Fraud\n", - "An individual, or a group of individuals, create or use another person’s identity, or personal details, to open or takeover an account." - ], - "metadata": { - "id": "o45K16ryPcBu" - } - }, - { - "cell_type": "markdown", - "source": [ - "## The dataset\n", - "\n", - "We will use Paysim dataset for the hands-on exercises. Paysim is a synthetic dataset that mimics real world mobile money transfer network.\n", - "\n", - "For more information on the dataset, please visit this [blog page](https://www.sisu.io/posts/paysim/)" - ], - "metadata": { - "id": "dFje1N1cPq_9" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "CALL apoc.meta.graph()" - ], - "metadata": { - "id": "AAeicV33PDXa" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "There are five types of transactions in this database. List all transaction types and corresponding metrics by iterating over all the transactions." - ], - "metadata": { - "id": "Ux5tg_OzUvgT" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "viWCvG1MU632" - } - }, - { - "cell_type": "code", - "source": [ - "transaction_df = (spark.read\n", - " .format('org.neo4j.spark.DataSource')\n", - " .option('labels', ':Transaction')\n", - " .load())\n", - "\n", - "transaction_df_count = transaction_df.count()\n", - "\n", - "transaction_df = (transaction_df.groupBy('')\n", - " .count()\n", - " .withColumnRenamed('', 'transaction'))\n", - "\n", - "transaction_df = (transaction_df\n", - " .withColumn('transaction', transaction_df['transaction'].getItem(0))\n", - " .withColumn('% transactions', transaction_df['count'] / transaction_df_count))\n", - "\n", - "transaction_df.show(truncate=False)" - ], - "metadata": { - "id": "xsIlmR-EQLeb" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "U2JUpe4NU6Jz" - } - }, - { - "cell_type": "code", - "source": [ - "transaction_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"labels\": \"Transaction\"})\n", - "\n", - "transaction_ps_count = transaction_ps.count()[0] * 1.0\n", - "\n", - "transaction_ps = (transaction_ps.groupby([''])\n", - " .size()\n", - " .reset_index(name='% transactions'))\n", - "\n", - "transaction_ps = transaction_ps.rename(columns={'': 'label'})\n", - "\n", - "transaction_ps['% transactions'] = transaction_ps['% transactions'].astype(float).div(transaction_ps_count * 1.0)\n", - "\n", - "transaction_ps.label = [x[0] for x in transaction_ps.label.to_numpy()]\n", - "\n", - "transaction_ps" - ], - "metadata": { - "id": "rfkeCU0PVFXx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "##### Plot the data\n", - "You can also use Python libraries like [Ploty](https://plotly.com/python/) to plot results" - ], - "metadata": { - "id": "t2RufDichKkQ" - } - }, - { - "cell_type": "code", - "source": [ - "import plotly.express as px\n", - "\n", - "# we use to_pandas() in order to transform the PySpark Pandas to a real Pandas Dataframe\n", - "fig = px.pie(transaction_ps.to_pandas(), values='% transactions', names='label')\n", - "\n", - "fig.show()" - ], - "metadata": { - "id": "Wn4HMYGVhV1t" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Exploit 
first-party Fraud\n", - "\n", - "Synthetic identity fraud and first party fraud can be identified by performing entity link analysis to detect identities linked to other identities via shared PII.\n", - "\n", - "There are three types of personally identifiable information (PII) in this dataset - SSN, Email and Phone Number\n", - "\n", - "Our hypothesis is that clients who share identifiers are suspicious and have a higher potential to commit fraud. However, all shared identifier links are not suspicious, for example, two people sharing an email address. Hence, we compute a fraud score based on shared PII relationships and label the top X percentile clients as fraudsters.\n", - "\n", - "We will first identify clients that share identifiers and create a new relationship between clients that share identifiers" - ], - "metadata": { - "id": "VwPKEtu2QLlv" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Enrich the dataset" - ], - "metadata": { - "id": "E162NudWkW2n" - } - }, - { - "cell_type": "markdown", - "source": [ - "In order to perfrorm our investigation we want to enrich the base dataset by identifing clients that share PII." - ], - "metadata": { - "id": "Y3MfFaKqH7Lm" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", - "WHERE id(c1) < id(c2)\n", - "RETURN c1.id, c2.id, count(*) AS freq\n", - "ORDER BY freq DESC;" - ], - "metadata": { - "id": "zJfRBlNNP9A1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now we can reuse the same Cypher query for creating our Dataframe and then use the Neo4j Spark Connector to create a new `SHARED_IDENTIFIERS` relationship betwen two clients:\n", - "\n", - "**(:Client)-[:SHARED_IDENTIFIERS]->(:Client)**\n", - "\n" - ], - "metadata": { - "id": "4J6d8U8bkMW_" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// let's check if there relationships are in there\n", - "MATCH (c:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)\n", - "RETURN *\n", - "LIMIT 10" - ], - "metadata": { - "id": "sQ7Nf_IUQQ1J" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "As you can see there are no relationships in the database" - ], - "metadata": { - "id": "6zUwaYxgQhbC" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark" - ], - "metadata": { - "id": "7CAiMFJ3LPmP" - } - }, - { - "cell_type": "code", - "source": [ - "shared_identifiers_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"query\", \"\"\"\n", - " MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", - " WHERE id(c1) < id(c2)\n", - " RETURN c1.id AS source, c2.id AS target, count(*) AS freq\n", - " \"\"\")\n", - " .load())\n", - "\n", - "(shared_identifiers_df.write\n", - " .format(\"org.neo4j.spark.DataSource\")\n", - " .mode(\"Overwrite\")\n", - " .option(\"relationship\", \"SHARED_IDENTIFIERS\")\n", - " .option(\"relationship.save.strategy\", \"keys\")\n", - " .option(\"relationship.source.labels\", \":Client\")\n", - " .option(\"relationship.source.save.mode\", \"Overwrite\")\n", - " .option(\"relationship.source.node.keys\", \"source:id\")\n", - " .option(\"relationship.target.labels\", \":Client\")\n", - " .option(\"relationship.target.node.keys\", \"target:id\")\n", - " .option(\"relationship.target.save.mode\", \"Overwrite\")\n", - " 
.option(\"relationship.properties\", \"freq:count\")\n", - " .save())" - ], - "metadata": { - "id": "36irb2nuj5Hi" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark Pandas" - ], - "metadata": { - "id": "h07CRHnlLU-G" - } - }, - { - "cell_type": "code", - "source": [ - "shared_identifiers_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"query\": \"\"\"\n", - " MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", - " WHERE id(c1) < id(c2)\n", - " RETURN c1.id AS source, c2.id AS target, count(*) AS freq\n", - "\"\"\"})\n", - "\n", - "shared_identifiers_ps.spark.to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", - " \"relationship\": \"SHARED_IDENTIFIERS\",\n", - " \"relationship.save.strategy\": \"keys\",\n", - " \"relationship.source.labels\": \":Client\",\n", - " \"relationship.source.save.mode\": \"Overwrite\",\n", - " \"relationship.source.node.keys\": \"source:id\",\n", - " \"relationship.target.labels\": \":Client\",\n", - " \"relationship.target.node.keys\": \"target:id\",\n", - " \"relationship.target.save.mode\": \"Overwrite\",\n", - " \"relationship.properties\": \"freq:count\"\n", - "})" - ], - "metadata": { - "id": "vZ6So9x_MBY_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// let's check (again) if there relationships are in there\n", - "MATCH (c:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)\n", - "RETURN *\n", - "LIMIT 10" - ], - "metadata": { - "id": "A6iUPMYMQAzF" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Build Fraud detection workflow in Neo4j GDS\n", - "\n", - "We will construct a workflow with graph algorithms to detect fraud rings, score clients based on the number of common connections and rank them to select the top few suspicious clients and label them as fraudsters.\n", - "\n", - "1. Identify clusters of clients sharing PII using a community detection algorithm (Weakly Connected Components)\n", - "2. Find similar clients within the clusters using pairwise similarity algorithms (Node Similarity)\n", - "3. Calculate and assign fraud score to clients using centrality algorithms (Degree Centrality)\n", - "4. Use computed fraud scores to label clients as potential fraudsters" - ], - "metadata": { - "id": "Fy8roSDHQw3h" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Identify groups of clients sharing PII (Fraud rings)\n", - "\n", - "Run Weakly connected components to find clusters of clients sharing PII.\n", - "\n", - "Weakly Connected Components is used to find groups of connected nodes, where all nodes in the same set form a connected component. WCC is often used early in an analysis understand the structure of a graph. More informaton here: [WCC documentation](https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/)" - ], - "metadata": { - "id": "bysMYQ23WFVl" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Create a graph projection\n", - "\n", - "A central concept in the GDS library is the management of in-memory graphs. Graph algorithms run on a graph data model which is a projection of the Neo4j property graph data model. 
For more information, please click here: [Graph Management](https://neo4j.com/docs/graph-data-science/current/management-ops/)\n", - "\n", - "A projected graph can be stored in the catalog under a user-defined name. Using that name, the graph can be referred to by any algorithm in the library." - ], - "metadata": { - "id": "39g6Fq1dTgLt" - } - }, - { - "cell_type": "markdown", - "source": [ - "Consider that the original Cypher query is the following:\n", - "```cypher\n", - "CALL gds.graph.project('wcc',\n", - " {\n", - " Client: {\n", - " label: 'Client'\n", - " }\n", - " },\n", - " {\n", - " SHARED_IDENTIFIERS:{\n", - " type: 'SHARED_IDENTIFIERS',\n", - " orientation: 'UNDIRECTED',\n", - " properties: {\n", - " count: {\n", - " property: 'count'\n", - " }\n", - " }\n", - " }\n", - " }\n", - ") YIELD graphName,nodeCount,relationshipCount,projectMillis;\n", - "```\n", - "\n", - "which will be translate into:" - ], - "metadata": { - "id": "fXQwdpJfVGIq" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "-fkvDjHcUV5Z" - } - }, - { - "cell_type": "code", - "source": [ - "wcc_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.graph.project\")\n", - " .option(\"gds.graphName\", \"wcc\")\n", - " .option(\"gds.nodeProjection.Client.label\", \"Client\")\n", - " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.type\", \"SHARED_IDENTIFIERS\")\n", - " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.orientation\", \"UNDIRECTED\")\n", - " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.properties.count.property\", \"count\")\n", - " .load())\n", - "\n", - "wcc_graph_proj_df.show(truncate=False)" - ], - "metadata": { - "id": "4dBHWHh8R7US" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "cauhX4FRVYMy" - } - }, - { - "cell_type": "code", - "source": [ - "wcc_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.graph.project\",\n", - " \"gds.graphName\": \"wcc\",\n", - " \"gds.nodeProjection.Client.label\": \"Client\",\n", - " \"gds.relationshipProjection.SHARED_IDENTIFIERS.type\": \"SHARED_IDENTIFIERS\",\n", - " \"gds.relationshipProjection.SHARED_IDENTIFIERS.orientation\": \"UNDIRECTED\",\n", - " \"gds.relationshipProjection.SHARED_IDENTIFIERS.properties.count.property\": \"count\"\n", - "})\n", - "\n", - "wcc_graph_proj_ps" - ], - "metadata": { - "id": "ZndBGbXPVeqU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Run the WCC algorithm\n", - "\n", - "The original Cypher query is:\n", - "\n", - "```cypher\n", - "CALL gds.wcc.stream('wcc',\n", - " {\n", - " nodeLabels: ['Client'],\n", - " relationshipTypes: ['SHARED_IDENTIFIERS'],\n", - " consecutiveIds: true\n", - " }\n", - ")\n", - "YIELD nodeId, componentId\n", - "RETURN gds.util.asNode(nodeId).id AS clientId, componentId\n", - "ORDER BY componentId\n", - "LIMIT 20\n", - "```\n", - "\n", - "which is transate into:" - ], - "metadata": { - "id": "P4oIKsUNn-ZH" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "Ygw7T3lSWbsQ" - } - }, - { - "cell_type": "code", - "source": [ - "# get the clients\n", - "clients_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"labels\", \"Client\")\n", - " .load())\n", - "\n", - "# invoke the gds wcc 
stream procedure\n", - "wcc_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.wcc.stream\")\n", - " .option(\"gds.graphName\", \"wcc\")\n", - " .option(\"gds.nodeLabels\", \"['Client']\")\n", - " .option(\"gds.relationshipTypes\", \"['SHARED_IDENTIFIERS']\")\n", - " .option(\"gds.consecutiveIds\", \"true\")\n", - " .load())\n", - "\n", - "# join the two dataframes and show id, componentId\n", - "client_component_df = (clients_df.join(wcc_df, clients_df[\"\"] == wcc_df[\"nodeId\"], \"inner\")\n", - " .select(\"id\", \"componentId\"))\n", - "\n", - "client_component_df.show(truncate=False)" - ], - "metadata": { - "id": "6RtkJV9GWHnu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "xQI9gQNQYMWe" - } - }, - { - "cell_type": "code", - "source": [ - "# get the clients\n", - "clients_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"labels\": \"Client\"})\n", - "\n", - "# invoke the gds wcc stream procedure\n", - "wcc_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.wcc.stream\",\n", - " \"gds.graphName\": \"wcc\",\n", - " \"gds.nodeLabels\": \"['Client']\",\n", - " \"gds.relationshipTypes\": \"['SHARED_IDENTIFIERS']\",\n", - " \"gds.consecutiveIds\": \"true\"\n", - "})\n", - "\n", - "# join the two pandas df and show id, componentId\n", - "client_component_ps = clients_ps.join(wcc_ps.set_index(\"nodeId\"), on=\"\")[[\"id\", \"componentId\"]]\n", - "\n", - "# we show only the first 20\n", - "client_component_ps[:20]" - ], - "metadata": { - "id": "AvlUnpIQYQsZ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Write results to the database.\n", - "Now that we identified clusters of clients sharing PII, we want to store these results back into the database by enriching the `Client` node.\n", - "We'll add the component id of the cluster as `firstPartyFraudGroup` property" - ], - "metadata": { - "id": "eNgmAuheZqfA" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "EKNcuklDaRKY" - } - }, - { - "cell_type": "code", - "source": [ - "(client_component_df\n", - " .withColumnRenamed(\"componentId\", \"firstPartyFraudGroup\")\n", - " .write\n", - " .format(\"org.neo4j.spark.DataSource\")\n", - " .mode(\"Overwrite\")\n", - " .option(\"labels\", \"Client\")\n", - " .option(\"node.keys\", \"id\")\n", - " .save())" - ], - "metadata": { - "id": "yQb0H-p7ZrRP" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "-HK28CGoa5_p" - } - }, - { - "cell_type": "code", - "source": [ - "(client_component_ps\n", - " .rename(columns={\"componentId\": \"firstPartyFraudGroup\"})\n", - " .spark\n", - " .to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", - " \"labels\": \"Client\",\n", - " \"node.keys\": \"id\"\n", - " }))" - ], - "metadata": { - "id": "zJDS-0bta8_s" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "// Visualize clusters with greater than 9 client nodes.\n", - "MATCH (c:Client)\n", - "WITH c.firstPartyFraudGroup AS fpGroupID, collect(c.id) AS fGroup\n", - "WITH *, size(fGroup) AS groupSize WHERE groupSize >= 9\n", - "WITH * LIMIT 1\n", - "MATCH 
p=(c:Client)-[:HAS_SSN|HAS_EMAIL|HAS_PHONE]->()\n", - "WHERE c.firstPartyFraudGroup = fpGroupID\n", - "RETURN p" - ], - "metadata": { - "id": "oOsrNUZocx21" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Compute pairwise similarity scores\n", - "\n", - "We use node similarity algorithm to find similar nodes based on the relationships to other nodes. Node similarity uses Jaccard metric ([Node Similarity](https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/#algorithms-node-similarity))\n", - "\n", - "Node similarity algorithms work on bipartite graphs (two types of nodes and relationships between them). Here we project client nodes (one type) and three identifiers nodes (that are considered as second type) into memory." - ], - "metadata": { - "id": "5CCwYp1FfoMU" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Project the graph\n", - "\n", - "The original Cypher query is\n", - "\n", - "```cypher\n", - "MATCH(c:Client) WHERE c.firstPartyFraudGroup is not NULL\n", - "WITH collect(c) as clients\n", - "MATCH(n) WHERE n:Email OR n:Phone OR n:SSN\n", - "WITH clients, collect(n) as identifiers\n", - "WITH clients + identifiers as nodes\n", - "\n", - "MATCH(c:Client) -[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(id)\n", - "WHERE c.firstPartyFraudGroup is not NULL\n", - "WITH nodes, collect({source: c, target: id}) as relationships\n", - "\n", - "CALL gds.graph.project.cypher('similarity',\n", - " \"UNWIND $nodes as n RETURN id(n) AS id,labels(n) AS labels\",\n", - " \"UNWIND $relationships as r RETURN id(r['source']) AS source, id(r['target']) AS target, 'HAS_IDENTIFIER' as type\",\n", - " { parameters: {nodes: nodes, relationships: relationships}}\n", - ")\n", - "YIELD graphName, nodeCount, relationshipCount, projectMillis\n", - "RETURN graphName, nodeCount, relationshipCount, projectMillis\n", - "```\n", - "\n", - "Which is translated into" - ], - "metadata": { - "id": "aLGQxFtpnQHa" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "aPSY4htNgLVG" - } - }, - { - "cell_type": "code", - "source": [ - "similarity_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.graph.project.cypher\")\n", - " .option(\"gds.graphName\", \"similarity\")\n", - " .option(\"gds.nodeQuery\", \"\"\"\n", - " MATCH (n)\n", - " WHERE (n:Client AND n.firstPartyFraudGroup is not NULL) OR n:Email OR n:Phone OR n:SSN\n", - " RETURN id(n) AS id, labels(n) AS labels\n", - " \"\"\")\n", - " .option(\"gds.relationshipQuery\", \"\"\"\n", - " MATCH (s:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(t)\n", - " WHERE s.firstPartyFraudGroup is not NULL\n", - " RETURN id(s) AS source, id(t) AS target, 'HAS_IDENTIFIER' as type\n", - " \"\"\")\n", - " .load())\n", - "\n", - "similarity_graph_proj_df.show(truncate=False)" - ], - "metadata": { - "id": "eNvVkJTRfuqM" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "PcJCKDDhmbtr" - } - }, - { - "cell_type": "code", - "source": [ - "similarity_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.graph.project.cypher\",\n", - " \"gds.graphName\": \"similarity\",\n", - " \"gds.nodeQuery\": \"\"\"\n", - " MATCH (n)\n", - " WHERE (n:Client AND n.firstPartyFraudGroup is not NULL) OR n:Email OR n:Phone OR n:SSN\n", - " RETURN id(n) AS id, labels(n) AS 
labels\n", - " \"\"\",\n", - " \"gds.relationshipQuery\": \"\"\"\n", - " MATCH (s:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(t)\n", - " WHERE s.firstPartyFraudGroup is not NULL\n", - " RETURN id(s) AS source, id(t) AS target, 'HAS_IDENTIFIER' as type\n", - " \"\"\"\n", - "})\n", - "\n", - "similarity_graph_proj_ps" - ], - "metadata": { - "id": "odriamHwmiEQ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Compute the node similarity\n", - "\n", - "We can mutate in-memory graph by writing outputs from the algorithm as node or relationship properties.\n", - "\n", - "In this particular case all the procedures with `mutate` and `write` suffix are not supported from the Neo4j Spark Connector, in this case we'll write a Cypher query:\n" - ], - "metadata": { - "id": "TfnVREvvmzG4" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "CALL gds.nodeSimilarity.mutate('similarity',\n", - " {\n", - " topK:15,\n", - " mutateProperty: 'jaccardScore',\n", - " mutateRelationshipType:'SIMILAR_TO'\n", - " }\n", - ");" - ], - "metadata": { - "id": "E0_WoCLlnIr6" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Mutate mode is very fast compared to write mode and it helps in optimizing algorithm execution times, then we write back the property from in-memory graph to the database and use it for further analysis:" - ], - "metadata": { - "id": "JnC3C7urPyN_" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "CALL gds.graph.writeRelationship('similarity', 'SIMILAR_TO', 'jaccardScore');" - ], - "metadata": { - "id": "xajV8enLPvIN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Exercise: Calculate First-party Fraud Score\n", - "\n", - "We compute first party fraud score using weighted degree centrality algorithm.\n", - "\n", - "In this step, we compute and assign fraud score (`firstPartyFraudScore`) to clients in the clusters identified in previous steps based on `SIMILAR_TO` relationships weighted by `jaccardScore`\n", - "\n", - "Weighted degree centrality algorithm add up similarity scores (`jaccardScore`) on the incoming `SIMILAR_TO` relationships for a given node in a cluster and assign the sum as the corresponding `firstPartyFraudScore`. This score represents clients who are similar to many others in the cluster in terms of sharing identifiers. Higher `firstPartyFraudScore` represents greater potential for committing fraud." - ], - "metadata": { - "id": "qpXXoWeIQk9U" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark" - ], - "metadata": { - "id": "fAp_acV-RBOu" - } - }, - { - "cell_type": "code", - "source": [ - "# invoke the gds.degree.stream procedure" - ], - "metadata": { - "id": "ZPAHXvT6Qouy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "similarity_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.degree.stream\")\n", - " .option(\"gds.graphName\", \"similarity\")\n", - " .option(\"gds.nodeLabels\", \"['Client']\")\n", - " .option(\"gds.relationshipTypes\", \"['SIMILAR_TO']\")\n", - " .option(\"gds.relationshipWeightProperty\", \"jaccardScore\")\n", - " .load())\n", - "\n", - "# join the two dataframes and show id, score\n", - "client_similarity_df = (clients_df.join(similarity_df, clients_df[\"\"] == similarity_df[\"nodeId\"], \"inner\")\n", - " .select(\"id\", \"score\")\n", - " .withColumnRenamed(\"score\", \"firstPartyFraudScore\"))\n", - "\n", - "# write the results back to the database\n", - "(client_similarity_df.write.format('org.neo4j.spark.DataSource')\n", - " .mode(\"Overwrite\")\n", - " .option(\"labels\", \"Client\")\n", - " .option(\"node.keys\", \"id\")\n", - " .save())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "1qKbMW3FSwLE" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark Pandas\n", - "\n" - ], - "metadata": { - "id": "na2h6VLov-ZA" - } - }, - { - "cell_type": "code", - "source": [ - "# invoke the gds.degree.stream procedure" - ], - "metadata": { - "id": "7ObUlkdmwDzi" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "similarity_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.degree.stream\",\n", - " \"gds.graphName\": \"similarity\",\n", - " \"gds.nodeLabels\": \"['Client']\",\n", - " \"gds.relationshipTypes\": \"['SIMILAR_TO']\",\n", - " \"gds.relationshipWeightProperty\": \"jaccardScore\"\n", - "})\n", - "\n", - "# join the two pandas df and show id, score\n", - "client_similarity_ps = (clients_ps.join(similarity_ps.set_index(\"nodeId\"), on=\"\")[[\"id\", \"score\"]]\n", - " .rename(columns={\"score\": \"firstPartyFraudScore\"}))\n", - "\n", - "# write the results back to the database\n", - "client_similarity_ps.spark.to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", - " \"labels\": \"Client\",\n", - " \"node.keys\": \"id\"\n", - "})\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "m8Cy4NtsS3nQ" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Verifiy the result\n", - "\n", - "We expect that:\n", - "- `similarity_df`/`similarity_ps`\n", - " - has two columns:\n", - " - `nodeId` of long type\n", - " - `score` of double type\n", - " - a count of **9134** rows\n", - "- `client_similarity_df`/`client_similarity_ps`\n", - " - has two columns:\n", - " - `id` of long type\n", - " - `score` of double type\n", - " - a count of 2433 rows" - ], - "metadata": { - "id": "I_TKq9TmU3cS" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Test PySpark Dataframe" - ], - "metadata": { - "id": "Dow9PWAtxBPu" - } - }, - { - "cell_type": "code", - "source": [ - "assert StructType([StructField(\"nodeId\", LongType()), StructField(\"score\", DoubleType())]) == similarity_df.schema\n", - "assert 9134 == similarity_df.count()\n", - "\n", - "assert StructType([StructField(\"id\", StringType()), StructField(\"firstPartyFraudScore\", DoubleType())]) == client_similarity_df.schema\n", - "assert 2433 == client_similarity_df.count()\n", - "print(\"All assertion are successfuly satisfied.\")" - ], - "metadata": { - "id": "5E6fUHwZU7PN" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Test PySpark Pandas" - ], - "metadata": { - "id": "feuIfTCqxGKt" - } - }, - { - "cell_type": "code", - "source": [ - "assert StructType([StructField(\"nodeId\", LongType()), StructField(\"score\", DoubleType())]) == similarity_ps.to_spark().schema\n", - "assert 9134 == similarity_ps.count()[0]\n", - "\n", - "assert StructType([StructField(\"id\", StringType()), StructField(\"firstPartyFraudScore\", DoubleType())]) == client_similarity_ps.to_spark().schema\n", - "assert 2433 == client_similarity_ps.count()[0]\n", - "print(\"All assertion are successfuly satisfied.\")" - ], - "metadata": { - "id": "OHel2EYoxFpC" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "We find clients with first-party fraud score greater than some threshold (X) and label those top X percentile clients as fraudsters. In this example, using 95th percentile as a threshold, we set a property FirstPartyFraudster on the Client node." - ], - "metadata": { - "id": "YBhfX5DYXsom" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (c:Client)\n", - "WHERE c.firstPartyFraudScore IS NOT NULL\n", - "WITH percentileCont(c.firstPartyFraudScore, 0.95) AS firstPartyFraudThreshold\n", - "MATCH (c:Client)\n", - "WHERE c.firstPartyFraudScore > firstPartyFraudThreshold\n", - "SET c:FirstPartyFraudster" - ], - "metadata": { - "id": "bSb4rZ-aXvYV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Second-party Fraud / Money Mules\n", - "\n", - "The first step is to find out clients who weren't identified as first party fraudsters but they transact with first party fraudsters." 
- ], - "metadata": { - "id": "WyKyntZ_YC1g" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH p=(:Client:FirstPartyFraudster)-[]-(:Transaction)-[]-(c:Client)\n", - "WHERE NOT c:FirstPartyFraudster\n", - "RETURN p\n", - "LIMIT 50" - ], - "metadata": { - "id": "C94Yg_psYTRW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Also, lets find out what types of transactions do these Clients perform with first party fraudsters" - ], - "metadata": { - "id": "Tm9WYZnwsQif" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (:Client:FirstPartyFraudster)-[]-(txn:Transaction)-[]-(c:Client)\n", - "WHERE NOT c:FirstPartyFraudster\n", - "UNWIND labels(txn) AS transactionType\n", - "RETURN transactionType, count(*) AS freq" - ], - "metadata": { - "id": "x2P9Y7IzY2Vl" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Create new relationships\n", - "\n", - "Let’s go ahead and create `TRANSFER_TO` relationships between clients with `firstPartyFraudster` tags and other clients. Also add the total amount from all such transactions as a property on `TRANSFER_TO` relationships.\n", - "\n", - "Since the total amount transferred from a fraudster to a client and the total amount transferred in the reverse direction are not the same, we have to create relationships in two separate queries.\n", - "\n", - "* `TRANSFER_TO` relationship from a fraudster to a client (look at the directions in queries)\n", - "* Add `SecondPartyFraudSuspect` tag to these clients" - ], - "metadata": { - "id": "f2HTap1vuLBW" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (c1:FirstPartyFraudster)-[]->(t:Transaction)-[]->(c2:Client)\n", - "WHERE NOT c2:FirstPartyFraudster\n", - "WITH c1, c2, sum(t.amount) AS totalAmount\n", - "SET c2:SecondPartyFraudSuspect\n", - "CREATE (c1)-[:TRANSFER_TO {amount:totalAmount}]->(c2)" - ], - "metadata": { - "id": "lfGkbJk2ueau" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "* `TRANSFER_TO` relationship from a client to a fraudster." 
- ], - "metadata": { - "id": "6yO2wJP6uksA" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH (c1:FirstPartyFraudster)<-[]-(t:Transaction)<-[]-(c2:Client)\n", - "WHERE NOT c2:FirstPartyFraudster\n", - "WITH c1, c2, sum(t.amount) AS totalAmount\n", - "SET c2:SecondPartyFraudSuspect\n", - "CREATE (c1)<-[:TRANSFER_TO {amount:totalAmount}]-(c2);" - ], - "metadata": { - "id": "WUeeP4MhujzM" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Visualize newly created `TRANSFER_TO` relationships" - ], - "metadata": { - "id": "68rcS85lutUx" - } - }, - { - "cell_type": "code", - "source": [ - "%%cypher\n", - "MATCH p=(:Client:FirstPartyFraudster)-[:TRANSFER_TO]-(c:Client)\n", - "WHERE NOT c:FirstPartyFraudster\n", - "RETURN p\n", - "LIMIT 50" - ], - "metadata": { - "id": "LC5kSXOGuu1w" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Goal\n", - "\n", - "Our objective is to find out clients who may have supported the first party fraudsters and were not identified as potential first party fraudsters.\n", - "\n", - "Our hypothesis is that clients who perform transactions of type `Transfer` where they either send or receive money from first party fraudsters are flagged as suspects for second party fraud.\n", - "\n", - "To identify such clients, make use of `TRANSFER_TO` relationships and use this recipe:\n", - "\n", - "* Use **WCC** (community detection) to identify networks of clients who are connected to first party fraudsters\n", - "* Use **PageRank** (centrality) to score clients based on their influence in terms of the amount of money transferred to/from fraudsters\n", - "* Assign risk score (`secondPartyFraudScore`) to these clients" - ], - "metadata": { - "id": "6swyguhivCEL" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Exercise: Project the graph\n", - "\n", - "Let’s use native projection and create an in-memory graph with Client nodes and TRANSFER_TO relationships.\n", - "\n", - "We want to project:\n", - "* `Client` for `nodeProjection`\n", - "* `TRANSFER_TO` for `relationshipProjection`\n", - "* for the configuration we want to set `relationshipProperties` to `amount`" - ], - "metadata": { - "id": "ILntBuwxvXCz" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark" - ], - "metadata": { - "id": "zgKufo0Z2C8e" - } - }, - { - "cell_type": "code", - "source": [ - "second_party_graph_proj_df = # insert your PySpark code here" - ], - "metadata": { - "id": "JmhR2nhwvWRu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "second_party_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.graph.project\")\n", - " .option(\"gds.graphName\", \"SecondPartyFraudNetwork\")\n", - " .option(\"gds.nodeProjection\", \"Client\")\n", - " .option(\"gds.relationshipProjection\", \"TRANSFER_TO\")\n", - " .option(\"gds.configuration.relationshipProperties\", \"amount\")\n", - " .load())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "Ab6GEhJ_Tsrx" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark Pandas" - ], - "metadata": { - "id": "QHnI8W5J2Gcp" - } - }, - { - "cell_type": "code", - "source": [ - "second_party_graph_proj_ps = # insert your PySpark Pandas code here" - ], - "metadata": { - "id": "kD6vXaab3JdM" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "second_party_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.graph.project\",\n", - " \"gds.graphName\": \"SecondPartyFraudNetwork\",\n", - " \"gds.nodeProjection\": \"Client\",\n", - " \"gds.relationshipProjection\": \"TRANSFER_TO\",\n", - " \"gds.configuration.relationshipProperties\": \"amount\"\n", - "})\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "c0VjV2rBTy10" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Verify the projection" - ], - "metadata": { - "id": "ZVul68Wi162b" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Test PySpark Dataframe" - ], - "metadata": { - "id": "jlOuUGA833ny" - } - }, - { - "cell_type": "code", - "source": [ - "second_party_graph_proj_df.cache()\n", - "\n", - "first_row = [\n", - " {\n", - " \"node\": list(row[\"nodeProjection\"].keys())[0],\n", - " \"rel\": list(row[\"relationshipProjection\"].keys())[0],\n", - " \"graphName\": row[\"graphName\"],\n", - " \"nodeCount\": row[\"nodeCount\"],\n", - " \"relCount\": row[\"relationshipCount\"]\n", - " } for row in second_party_graph_proj_df.collect()\n", - "][0]\n", - "\n", - "assert first_row == {\"node\": \"Client\", \"rel\": \"TRANSFER_TO\", \"graphName\": \"SecondPartyFraudNetwork\", \"nodeCount\": 2433, \"relCount\": 367}\n", - "print(\"All assertion are successfuly satisfied.\")" - ], - "metadata": { - "id": "hkjx6Nu339wW" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#### Test PySpark Pandas" - ], - "metadata": { - "id": "37mDMX8G4EPE" - } - }, - { - "cell_type": "code", - "source": [ - "second_party_graph_proj_ps_df = second_party_graph_proj_ps.to_spark()\n", - "\n", - "second_party_graph_proj_ps_df.cache()\n", - "\n", - "first_row = [\n", - " {\n", - " \"node\": list(row[\"nodeProjection\"].keys())[0],\n", - " \"rel\": list(row[\"relationshipProjection\"].keys())[0],\n", - " \"graphName\": row[\"graphName\"],\n", - " \"nodeCount\": row[\"nodeCount\"],\n", - " \"relCount\": row[\"relationshipCount\"]\n", - " } for row in second_party_graph_proj_ps_df.collect()\n", - "][0]\n", - "\n", - "assert first_row == {\"node\": \"Client\", \"rel\": \"TRANSFER_TO\", \"graphName\": \"SecondPartyFraudNetwork\", \"nodeCount\": 2433, \"relCount\": 367}\n", - "print(\"All assertion are successfuly satisfied.\")" - ], - "metadata": { - "id": "spF535cp4IcR" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "## Check clusters with more than one clients\n", - "We will see if there are any clusters with more than one clients in them and if there are, then we should add a tag `secondPartyFraudGroup` to find them later using local queries." 
- ], - "metadata": { - "id": "32FL1L324QxV" - } - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark" - ], - "metadata": { - "id": "UljCEN-s40mc" - } - }, - { - "cell_type": "code", - "source": [ - "# invoke gds.wcc.stream on\n", - "second_party_wcc_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", - " .option(\"gds\", \"gds.wcc.stream\")\n", - " .option(\"gds.graphName\", \"SecondPartyFraudNetwork\")\n", - " .load())\n", - "\n", - "# join the two dataframes aggregate by componentId\n", - "# and filtering for clusters with a size greater then 1\n", - "second_party_client_component_df = (clients_df.join(second_party_wcc_df, clients_df[\"\"] == second_party_wcc_df[\"nodeId\"], \"inner\")\n", - " .groupBy(\"componentId\")\n", - " .agg(count(\"*\").alias(\"count\"), collect_list(clients_df[\"id\"]).alias(\"cluster\"))\n", - " .filter(size(\"cluster\") > 1))\n", - "\n", - "second_party_client_component_df = (second_party_client_component_df\n", - " .withColumn(\"id\", explode(col(\"cluster\")))\n", - " .select(\"id\", \"componentId\"))\n", - "\n", - "\n", - "second_party_client_component_df.show(truncate=False)" - ], - "metadata": { - "id": "jKK3m8rH150-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Code in PySpark Pandas" - ], - "metadata": { - "id": "cL0e6QHDZZZi" - } - }, - { - "cell_type": "code", - "source": [ - "second_party_wcc_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", - " \"gds\": \"gds.wcc.stream\",\n", - " \"gds.graphName\": \"SecondPartyFraudNetwork\"\n", - "})\n", - "\n", - "second_party_client_component_ps = (clients_ps.join(second_party_wcc_ps.set_index(\"nodeId\"), on=\"\")\n", - " .groupby([\"componentId\"])\n", - " .id\n", - " .apply(list)\n", - " .reset_index()\n", - " .rename(columns={'id': 'cluster'})\n", - ")\n", - "\n", - "second_party_client_component_ps = second_party_client_component_ps[second_party_client_component_ps[\"cluster\"].apply(lambda a: len(a)) > 1]\n", - "\n", - "second_party_client_component_ps = (second_party_client_component_ps\n", - " .explode(\"cluster\")\n", - " .rename(columns={'cluster': 'id'})\n", - " [[\"id\", \"componentId\"]])\n", - "\n", - "second_party_client_component_ps[:20]" - ], - "metadata": { - "id": "49OxrIuTZdb-" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### Exercise: Write the results back to database\n", - "\n", - "Write a Spark job that given the columns (`id`, `componentId`) retrieve the `Client` by the `id` and set for the node a property `secondPartyFraudGroup` with the value of `componentId`" - ], - "metadata": { - "id": "9IsIbHD7b3cL" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark" - ], - "metadata": { - "id": "1U2kzCyWdv8l" - } - }, - { - "cell_type": "code", - "source": [ - "# write your PySpark code here" - ], - "metadata": { - "id": "PTid4oiqg_lu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "(second_party_client_component_df\n", - " .withColumnRenamed(\"componentId\", \"secondPartyFraudGroup\")\n", - " .write\n", - " .format(\"org.neo4j.spark.DataSource\")\n", - " .mode(\"Overwrite\")\n", - " .option(\"labels\", \"Client\")\n", - " .option(\"node.keys\", \"id\")\n", - " .save())\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "BQB9o9JCYuOs" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Code in PySpark Pandas" - ], - "metadata": { - "id": "QgUs-jzIdyWJ" - } - }, - { - "cell_type": "code", - "source": [ - "# write your PySpark Pandas code here" - ], - "metadata": { - "id": "Copd3BPHduvJ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "
\n", - "\n", - "Show a possible solution\n", - "\n", - "\n", - "```python\n", - "(client_component_ps\n", - " .rename(columns={\"componentId\": \"secondPartyFraudGroup\"})\n", - " .spark\n", - " .to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", - " \"labels\": \"Client\",\n", - " \"node.keys\": \"id\"\n", - " }))\n", - "```\n", - "\n", - "
" - ], - "metadata": { - "id": "7uiMcrNHY3GS" - } - }, - { - "cell_type": "markdown", - "source": [ - "#### Verify the Spark job result" - ], - "metadata": { - "id": "0mi9x4ragzyX" - } - }, - { - "cell_type": "code", - "source": [ - "secondPartyFraudGroup_check_count = %cypher -u $neo4j_url MATCH (c:Client) WHERE c.secondPartyFraudGroup IS NOT NULL RETURN count(c) AS count\n", - "assert 2433 == secondPartyFraudGroup_check_count['count'][0]\n", - "print(\"All assertion are successfuly satisfied.\")" - ], - "metadata": { - "id": "UM3TB_KxgzSh" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "xSkwu_ovYaI7" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Open this notebook in Google Colab \n", + " \"Open\n", + "" + ], + "metadata": { + "id": "ciNaixnkx1vj" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Example of a Simple data science workflow with Neo4j and Spark" + ], + "metadata": { + "id": "zADiJjnuVfq2" + } + }, + { + "cell_type": "markdown", + "source": [ + "This notebook contains a set of examples that explains how the Neo4j Spark connector can fit in you Data Scinece workflow, how you can combine Spark Neo4j and the Graph Data Science library to extract insights from your data and mostly important it allows you to test your knowledge with a set of exercises after each section.\n", + "\n", + "If you have any questions or problems feel free to write a post in the [Neo4j community forum](https://community.neo4j.com/) or in [Discord](https://discord.com/invite/neo4j).\n", + "\n", + "If you want more exercises feel free to open an issue in the [GitHub repository](https://github.com/neo4j/neo4j-spark-connector).\n", + "\n", + "Enjoy!" + ], + "metadata": { + "id": "nLucMn17V0YK" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Notes about this notebook\n", + "\n", + "This code contains a simple data science workflow that combines Neo4j's Graph Data Science Library with the Neo4j Connector for Apache Spark.\n", + "\n", + "Going forward you'll find code examples in:\n", + "\n", + "* PySpark\n", + "* PySpark Pandas\n", + "\n", + "You can choose to navigate by using one of them, or both, but we suggest you do one at time to ensure you understand the APIs." 
+ ], + "metadata": { + "id": "pWWY8190RB98" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Create the sandbox instance\n", + "\n", + "You can easily spin-up a Neo4j sandbox by click [here](https://sandbox.neo4j.com/?usecase=fraud-detection)\n", + "\n", + "After that you'll be redirect in a webpage like this:\n", + "\n", + "\n", + "\n", + "Please click in the **Connection details tab** and copy your connection parameters into the Python variables below" + ], + "metadata": { + "id": "3hCQBmBKVaHm" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ttxf62TPVP-w" + }, + "outputs": [], + "source": [ + "neo4j_url = \"\" # put your neo4j url here" + ] + }, + { + "cell_type": "code", + "source": [ + "neo4j_user = \"neo4j\" # put your neo4j user here" + ], + "metadata": { + "id": "-lPr1hfIGtfL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "neo4j_password = \"\" # put your neo4j password here" + ], + "metadata": { + "id": "yoI29jjvGvlX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Configure the Spark Environment" + ], + "metadata": { + "id": "Capd99x5G2rm" + } + }, + { + "cell_type": "code", + "source": [ + "spark_version = '3.3.4'" + ], + "metadata": { + "id": "OiHMiko1-Qf7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!apt-get install openjdk-17-jdk-headless -qq > /dev/null" + ], + "metadata": { + "id": "qdjzLBDzGx5l" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!wget -q https://dlcdn.apache.org/spark/spark-$spark_version/spark-$spark_version-bin-hadoop3.tgz" + ], + "metadata": { + "id": "7JT9OKhzG7Lq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "A3gsnSHl0F99" + }, + "outputs": [], + "source": [ + "!tar xf spark-$spark_version-bin-hadoop3.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hSBQWKs90vSx" + }, + "outputs": [], + "source": [ + "!pip install -q findspark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tnW0a1Gj080k" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-17-openjdk-amd64\"\n", + "os.environ[\"SPARK_HOME\"] = f\"/content/spark-{spark_version}-bin-hadoop3\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dlUBSezK1DpZ" + }, + "outputs": [], + "source": [ + "import findspark\n", + "findspark.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dOUJ-W871Tur" + }, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "spark = (SparkSession.builder\n", + " .master('local[*]')\n", + " .appName('Data science workflow with Neo4j and Spark')\n", + " .config('spark.ui.port', '4050')\n", + " # Just to show dataframes as tables\n", + " #.config('spark.sql.repl.eagerEval.enabled', False)\n", + " .config('spark.jars.packages', 'org.neo4j:neo4j-connector-apache-spark_2.13:6.0.0_for_spark_4')\n", + " # As we're using always the same database instance we'll\n", + " # define them as global variables\n", + " # so we don't need to repeat them each time\n", + " .config(\"neo4j.url\", neo4j_url)\n", + " .config(\"neo4j.authentication.type\", \"basic\")\n", + " .config(\"neo4j.authentication.basic.username\", 
neo4j_user)\n", + " .config(\"neo4j.authentication.basic.password\", neo4j_password)\n", + " .getOrCreate())\n", + "spark" + ] + }, + { + "cell_type": "code", + "source": [ + "# import utility functions that we'll use in the notebook\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.functions import *" + ], + "metadata": { + "id": "pghCcGnJWcZQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Import PySpark Pandas\n", + "\n", + "Pandas API on Apache Spark (PySpark) enables data scientists and data engineers to run their existing Pandas code on Spark. Prior to this API, you had to do a significant code rewrite from Pandas DataFrame to PySpark DataFrame which is time-consuming and error-prone.\n", + "\n", + "In this notebook we'll use both PySpark Dataframes and and PySpark Pandas.\n", + "\n", + "The only thing that we need to do is to import the library using the statement below." + ], + "metadata": { + "id": "klQ2Ah6CFBV1" + } + }, + { + "cell_type": "code", + "source": [ + "import pyspark.pandas as ps" + ], + "metadata": { + "id": "lDkBcHySCBT0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "## Exercises prerequisite\n", + "\n", + "In this notebook we and going to test your knowledge. Some of the exercises require the Neo4j Python driver to check if the exercises are being solved correctly.\n", + "\n", + "*Neo4j Python Driver is required only for verifying the exercises when you persist data from Spark to Neo4j*\n", + "\n", + "**It's not required by the Spark connector!!!**\n", + "\n", + "We'll use [Cy2Py](https://github.com/conker84/cy2py), a Jupyter extension that easily allows you to connect to Neo4j and visualize data from Jupyter notebooks.\n", + "For a detailed instruction about how to use it please dive into [this example](https://github.com/conker84/cy2py/blob/main/examples/Neo4j_Crime_Investigation_Dataset.ipynb)" + ], + "metadata": { + "id": "b6_YNZnZ5GdT" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install -q cy2py" + ], + "metadata": { + "id": "f5ZZJylo5Bbz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "let's load the extension" + ], + "metadata": { + "id": "uKYEPEgOcG2b" + } + }, + { + "cell_type": "code", + "source": [ + "%load_ext cy2py" + ], + "metadata": { + "id": "38EeXF6icKOK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can query the database via **cy2py** in this simple way" + ], + "metadata": { + "id": "peqcEHj0b35T" + } + }, + { + "cell_type": "code", + "source": [ + "# define the colors for the nodes\n", + "colors = {\n", + " ':Client': '#D18711',\n", + " ':Bank': '#0541B2',\n", + " ':Merchant': '#9E14AA',\n", + " ':Mule': '#6113A3',\n", + " ':CashIn': '#328918',\n", + " ':CashOut': '#C1A23D',\n", + " ':Debit': '#A32727',\n", + " ':Payment': '#3B80C4',\n", + " ':Transfer': '#088472',\n", + " ':Transaction': '#D10B4F',\n", + " ':Email': '#EA5D1E',\n", + " ':SSN': '#707070',\n", + " ':Phone': '#4B4444',\n", + "}" + ], + "metadata": { + "id": "dw2P-XpfLCJY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher -u $neo4j_url -us $neo4j_user -pw $neo4j_password -co $colors\n", + "CALL apoc.meta.graph()" + ], + "metadata": { + "id": "BfFOTNkncMqp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Problem 
Definition\n" + ], + "metadata": { + "id": "d-x29ClTPBnv" + } + }, + { + "cell_type": "markdown", + "source": [ + "## What is Fraud?\n", + "Fraud occurs when an individual, a group of individuals, or a business entity intentionally deceives another individual or business entity through misrepresentation of identity, products, services, or financial transactions, and/or false promises made with no intention of fulfilling them." + ], + "metadata": { + "id": "79q5QJfcPMa6" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Fraud Categories\n" + ], + "metadata": { + "id": "naUmXhC-PQGR" + } + }, + { + "cell_type": "markdown", + "source": [ + "### First-party Fraud\n", + "An individual, or a group of individuals, misrepresents their identity or gives false information when applying for a product or service, either to receive more favourable rates or with no intention of repayment." + ], + "metadata": { + "id": "edTfWFSAPUKF" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Second-party Fraud\n", + "An individual knowingly gives their identity or personal information to another individual to commit fraud, or someone perpetrates fraud on their behalf." + ], + "metadata": { + "id": "Zr9sGs_9PYmH" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Third-party Fraud\n", + "An individual, or a group of individuals, creates or uses another person’s identity, or personal details, to open or take over an account." + ], + "metadata": { + "id": "o45K16ryPcBu" + } + }, + { + "cell_type": "markdown", + "source": [ + "## The dataset\n", + "\n", + "We will use the Paysim dataset for the hands-on exercises. Paysim is a synthetic dataset that mimics a real-world mobile money transfer network.\n", + "\n", + "For more information on the dataset, please visit this [blog page](https://www.sisu.io/posts/paysim/)" + ], + "metadata": { + "id": "dFje1N1cPq_9" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "CALL apoc.meta.graph()" + ], + "metadata": { + "id": "AAeicV33PDXa" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "There are five types of transactions in this database. List all transaction types and corresponding metrics by iterating over all the transactions."
+ ], + "metadata": { + "id": "Ux5tg_OzUvgT" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "viWCvG1MU632" + } + }, + { + "cell_type": "code", + "source": [ + "transaction_df = (spark.read\n", + " .format('org.neo4j.spark.DataSource')\n", + " .option('labels', ':Transaction')\n", + " .load())\n", + "\n", + "transaction_df_count = transaction_df.count()\n", + "\n", + "transaction_df = (transaction_df.groupBy('')\n", + " .count()\n", + " .withColumnRenamed('', 'transaction'))\n", + "\n", + "transaction_df = (transaction_df\n", + " .withColumn('transaction', transaction_df['transaction'].getItem(0))\n", + " .withColumn('% transactions', transaction_df['count'] / transaction_df_count))\n", + "\n", + "transaction_df.show(truncate=False)" + ], + "metadata": { + "id": "xsIlmR-EQLeb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "U2JUpe4NU6Jz" + } + }, + { + "cell_type": "code", + "source": [ + "transaction_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"labels\": \"Transaction\"})\n", + "\n", + "transaction_ps_count = transaction_ps.count()[0] * 1.0\n", + "\n", + "transaction_ps = (transaction_ps.groupby([''])\n", + " .size()\n", + " .reset_index(name='% transactions'))\n", + "\n", + "transaction_ps = transaction_ps.rename(columns={'': 'label'})\n", + "\n", + "transaction_ps['% transactions'] = transaction_ps['% transactions'].astype(float).div(transaction_ps_count * 1.0)\n", + "\n", + "transaction_ps.label = [x[0] for x in transaction_ps.label.to_numpy()]\n", + "\n", + "transaction_ps" + ], + "metadata": { + "id": "rfkeCU0PVFXx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "##### Plot the data\n", + "You can also use Python libraries like [Ploty](https://plotly.com/python/) to plot results" + ], + "metadata": { + "id": "t2RufDichKkQ" + } + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "# we use to_pandas() in order to transform the PySpark Pandas to a real Pandas Dataframe\n", + "fig = px.pie(transaction_ps.to_pandas(), values='% transactions', names='label')\n", + "\n", + "fig.show()" + ], + "metadata": { + "id": "Wn4HMYGVhV1t" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Exploit first-party Fraud\n", + "\n", + "Synthetic identity fraud and first party fraud can be identified by performing entity link analysis to detect identities linked to other identities via shared PII.\n", + "\n", + "There are three types of personally identifiable information (PII) in this dataset - SSN, Email and Phone Number\n", + "\n", + "Our hypothesis is that clients who share identifiers are suspicious and have a higher potential to commit fraud. However, all shared identifier links are not suspicious, for example, two people sharing an email address. 
Hence, we compute a fraud score based on shared PII relationships and label the top X percentile clients as fraudsters.\n", + "\n", + "We will first identify clients that share identifiers and create a new relationship between them" + ], + "metadata": { + "id": "VwPKEtu2QLlv" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Enrich the dataset" + ], + "metadata": { + "id": "E162NudWkW2n" + } + }, + { + "cell_type": "markdown", + "source": [ + "In order to perform our investigation, we want to enrich the base dataset by identifying clients that share PII." + ], + "metadata": { + "id": "Y3MfFaKqH7Lm" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", + "WHERE id(c1) < id(c2)\n", + "RETURN c1.id, c2.id, count(*) AS freq\n", + "ORDER BY freq DESC;" + ], + "metadata": { + "id": "zJfRBlNNP9A1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we can reuse the same Cypher query to create our DataFrame and then use the Neo4j Spark Connector to create a new `SHARED_IDENTIFIERS` relationship between two clients:\n", + "\n", + "**(:Client)-[:SHARED_IDENTIFIERS]->(:Client)**\n", + "\n" + ], + "metadata": { + "id": "4J6d8U8bkMW_" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// let's check if these relationships are already there\n", + "MATCH (c:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)\n", + "RETURN *\n", + "LIMIT 10" + ], + "metadata": { + "id": "sQ7Nf_IUQQ1J" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "As you can see, there are no relationships in the database" + ], + "metadata": { + "id": "6zUwaYxgQhbC" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark" + ], + "metadata": { + "id": "7CAiMFJ3LPmP" + } + }, + { + "cell_type": "code", + "source": [ + "shared_identifiers_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"query\", \"\"\"\n", + " MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", + " WHERE id(c1) < id(c2)\n", + " RETURN c1.id AS source, c2.id AS target, count(*) AS freq\n", + " \"\"\")\n", + " .load())\n", + "\n", + "(shared_identifiers_df.write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"relationship\", \"SHARED_IDENTIFIERS\")\n", + " .option(\"relationship.save.strategy\", \"keys\")\n", + " .option(\"relationship.source.labels\", \":Client\")\n", + " .option(\"relationship.source.save.mode\", \"Overwrite\")\n", + " .option(\"relationship.source.node.keys\", \"source:id\")\n", + " .option(\"relationship.target.labels\", \":Client\")\n", + " .option(\"relationship.target.node.keys\", \"target:id\")\n", + " .option(\"relationship.target.save.mode\", \"Overwrite\")\n", + " .option(\"relationship.properties\", \"freq:count\")\n", + " .save())" + ], + "metadata": { + "id": "36irb2nuj5Hi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark Pandas" + ], + "metadata": { + "id": "h07CRHnlLU-G" + } + }, + { + "cell_type": "code", + "source": [ + "shared_identifiers_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"query\": \"\"\"\n", + " MATCH (c1:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(n)<-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]-(c2:Client)\n", + " WHERE id(c1) <
id(c2)\n", + " RETURN c1.id AS source, c2.id AS target, count(*) AS freq\n", + "\"\"\"})\n", + "\n", + "shared_identifiers_ps.spark.to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", + " \"relationship\": \"SHARED_IDENTIFIERS\",\n", + " \"relationship.save.strategy\": \"keys\",\n", + " \"relationship.source.labels\": \":Client\",\n", + " \"relationship.source.save.mode\": \"Overwrite\",\n", + " \"relationship.source.node.keys\": \"source:id\",\n", + " \"relationship.target.labels\": \":Client\",\n", + " \"relationship.target.node.keys\": \"target:id\",\n", + " \"relationship.target.save.mode\": \"Overwrite\",\n", + " \"relationship.properties\": \"freq:count\"\n", + "})" + ], + "metadata": { + "id": "vZ6So9x_MBY_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// let's check (again) if there relationships are in there\n", + "MATCH (c:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)\n", + "RETURN *\n", + "LIMIT 10" + ], + "metadata": { + "id": "A6iUPMYMQAzF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Build Fraud detection workflow in Neo4j GDS\n", + "\n", + "We will construct a workflow with graph algorithms to detect fraud rings, score clients based on the number of common connections and rank them to select the top few suspicious clients and label them as fraudsters.\n", + "\n", + "1. Identify clusters of clients sharing PII using a community detection algorithm (Weakly Connected Components)\n", + "2. Find similar clients within the clusters using pairwise similarity algorithms (Node Similarity)\n", + "3. Calculate and assign fraud score to clients using centrality algorithms (Degree Centrality)\n", + "4. Use computed fraud scores to label clients as potential fraudsters" + ], + "metadata": { + "id": "Fy8roSDHQw3h" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Identify groups of clients sharing PII (Fraud rings)\n", + "\n", + "Run Weakly connected components to find clusters of clients sharing PII.\n", + "\n", + "Weakly Connected Components is used to find groups of connected nodes, where all nodes in the same set form a connected component. WCC is often used early in an analysis understand the structure of a graph. More informaton here: [WCC documentation](https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/)" + ], + "metadata": { + "id": "bysMYQ23WFVl" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Create a graph projection\n", + "\n", + "A central concept in the GDS library is the management of in-memory graphs. Graph algorithms run on a graph data model which is a projection of the Neo4j property graph data model. For more information, please click here: [Graph Management](https://neo4j.com/docs/graph-data-science/current/management-ops/)\n", + "\n", + "A projected graph can be stored in the catalog under a user-defined name. Using that name, the graph can be referred to by any algorithm in the library." 
+ ], + "metadata": { + "id": "39g6Fq1dTgLt" + } + }, + { + "cell_type": "markdown", + "source": [ + "Consider that the original Cypher query is the following:\n", + "```cypher\n", + "CALL gds.graph.project('wcc',\n", + " {\n", + " Client: {\n", + " label: 'Client'\n", + " }\n", + " },\n", + " {\n", + " SHARED_IDENTIFIERS:{\n", + " type: 'SHARED_IDENTIFIERS',\n", + " orientation: 'UNDIRECTED',\n", + " properties: {\n", + " count: {\n", + " property: 'count'\n", + " }\n", + " }\n", + " }\n", + " }\n", + ") YIELD graphName,nodeCount,relationshipCount,projectMillis;\n", + "```\n", + "\n", + "which will be translate into:" + ], + "metadata": { + "id": "fXQwdpJfVGIq" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "-fkvDjHcUV5Z" + } + }, + { + "cell_type": "code", + "source": [ + "wcc_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.graph.project\")\n", + " .option(\"gds.graphName\", \"wcc\")\n", + " .option(\"gds.nodeProjection.Client.label\", \"Client\")\n", + " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.type\", \"SHARED_IDENTIFIERS\")\n", + " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.orientation\", \"UNDIRECTED\")\n", + " .option(\"gds.relationshipProjection.SHARED_IDENTIFIERS.properties.count.property\", \"count\")\n", + " .load())\n", + "\n", + "wcc_graph_proj_df.show(truncate=False)" + ], + "metadata": { + "id": "4dBHWHh8R7US" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "cauhX4FRVYMy" + } + }, + { + "cell_type": "code", + "source": [ + "wcc_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.graph.project\",\n", + " \"gds.graphName\": \"wcc\",\n", + " \"gds.nodeProjection.Client.label\": \"Client\",\n", + " \"gds.relationshipProjection.SHARED_IDENTIFIERS.type\": \"SHARED_IDENTIFIERS\",\n", + " \"gds.relationshipProjection.SHARED_IDENTIFIERS.orientation\": \"UNDIRECTED\",\n", + " \"gds.relationshipProjection.SHARED_IDENTIFIERS.properties.count.property\": \"count\"\n", + "})\n", + "\n", + "wcc_graph_proj_ps" + ], + "metadata": { + "id": "ZndBGbXPVeqU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Run the WCC algorithm\n", + "\n", + "The original Cypher query is:\n", + "\n", + "```cypher\n", + "CALL gds.wcc.stream('wcc',\n", + " {\n", + " nodeLabels: ['Client'],\n", + " relationshipTypes: ['SHARED_IDENTIFIERS'],\n", + " consecutiveIds: true\n", + " }\n", + ")\n", + "YIELD nodeId, componentId\n", + "RETURN gds.util.asNode(nodeId).id AS clientId, componentId\n", + "ORDER BY componentId\n", + "LIMIT 20\n", + "```\n", + "\n", + "which is transate into:" + ], + "metadata": { + "id": "P4oIKsUNn-ZH" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "Ygw7T3lSWbsQ" + } + }, + { + "cell_type": "code", + "source": [ + "# get the clients\n", + "clients_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"labels\", \"Client\")\n", + " .load())\n", + "\n", + "# invoke the gds wcc stream procedure\n", + "wcc_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.wcc.stream\")\n", + " .option(\"gds.graphName\", \"wcc\")\n", + " .option(\"gds.nodeLabels\", \"['Client']\")\n", + " .option(\"gds.relationshipTypes\", 
\"['SHARED_IDENTIFIERS']\")\n", + " .option(\"gds.consecutiveIds\", \"true\")\n", + " .load())\n", + "\n", + "# join the two dataframes and show id, componentId\n", + "client_component_df = (clients_df.join(wcc_df, clients_df[\"\"] == wcc_df[\"nodeId\"], \"inner\")\n", + " .select(\"id\", \"componentId\"))\n", + "\n", + "client_component_df.show(truncate=False)" + ], + "metadata": { + "id": "6RtkJV9GWHnu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "xQI9gQNQYMWe" + } + }, + { + "cell_type": "code", + "source": [ + "# get the clients\n", + "clients_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\"labels\": \"Client\"})\n", + "\n", + "# invoke the gds wcc stream procedure\n", + "wcc_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.wcc.stream\",\n", + " \"gds.graphName\": \"wcc\",\n", + " \"gds.nodeLabels\": \"['Client']\",\n", + " \"gds.relationshipTypes\": \"['SHARED_IDENTIFIERS']\",\n", + " \"gds.consecutiveIds\": \"true\"\n", + "})\n", + "\n", + "# join the two pandas df and show id, componentId\n", + "client_component_ps = clients_ps.join(wcc_ps.set_index(\"nodeId\"), on=\"\")[[\"id\", \"componentId\"]]\n", + "\n", + "# we show only the first 20\n", + "client_component_ps[:20]" + ], + "metadata": { + "id": "AvlUnpIQYQsZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Write results to the database.\n", + "Now that we identified clusters of clients sharing PII, we want to store these results back into the database by enriching the `Client` node.\n", + "We'll add the component id of the cluster as `firstPartyFraudGroup` property" + ], + "metadata": { + "id": "eNgmAuheZqfA" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "EKNcuklDaRKY" + } + }, + { + "cell_type": "code", + "source": [ + "(client_component_df\n", + " .withColumnRenamed(\"componentId\", \"firstPartyFraudGroup\")\n", + " .write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \"Client\")\n", + " .option(\"node.keys\", \"id\")\n", + " .save())" + ], + "metadata": { + "id": "yQb0H-p7ZrRP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "-HK28CGoa5_p" + } + }, + { + "cell_type": "code", + "source": [ + "(client_component_ps\n", + " .rename(columns={\"componentId\": \"firstPartyFraudGroup\"})\n", + " .spark\n", + " .to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", + " \"labels\": \"Client\",\n", + " \"node.keys\": \"id\"\n", + " }))" + ], + "metadata": { + "id": "zJDS-0bta8_s" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "// Visualize clusters with greater than 9 client nodes.\n", + "MATCH (c:Client)\n", + "WITH c.firstPartyFraudGroup AS fpGroupID, collect(c.id) AS fGroup\n", + "WITH *, size(fGroup) AS groupSize WHERE groupSize >= 9\n", + "WITH * LIMIT 1\n", + "MATCH p=(c:Client)-[:HAS_SSN|HAS_EMAIL|HAS_PHONE]->()\n", + "WHERE c.firstPartyFraudGroup = fpGroupID\n", + "RETURN p" + ], + "metadata": { + "id": "oOsrNUZocx21" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Compute pairwise similarity scores\n", + "\n", + 
"We use node similarity algorithm to find similar nodes based on the relationships to other nodes. Node similarity uses Jaccard metric ([Node Similarity](https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/#algorithms-node-similarity))\n", + "\n", + "Node similarity algorithms work on bipartite graphs (two types of nodes and relationships between them). Here we project client nodes (one type) and three identifiers nodes (that are considered as second type) into memory." + ], + "metadata": { + "id": "5CCwYp1FfoMU" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Project the graph\n", + "\n", + "The original Cypher query is\n", + "\n", + "```cypher\n", + "MATCH(c:Client) WHERE c.firstPartyFraudGroup is not NULL\n", + "WITH collect(c) as clients\n", + "MATCH(n) WHERE n:Email OR n:Phone OR n:SSN\n", + "WITH clients, collect(n) as identifiers\n", + "WITH clients + identifiers as nodes\n", + "\n", + "MATCH(c:Client) -[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(id)\n", + "WHERE c.firstPartyFraudGroup is not NULL\n", + "WITH nodes, collect({source: c, target: id}) as relationships\n", + "\n", + "CALL gds.graph.project.cypher('similarity',\n", + " \"UNWIND $nodes as n RETURN id(n) AS id,labels(n) AS labels\",\n", + " \"UNWIND $relationships as r RETURN id(r['source']) AS source, id(r['target']) AS target, 'HAS_IDENTIFIER' as type\",\n", + " { parameters: {nodes: nodes, relationships: relationships}}\n", + ")\n", + "YIELD graphName, nodeCount, relationshipCount, projectMillis\n", + "RETURN graphName, nodeCount, relationshipCount, projectMillis\n", + "```\n", + "\n", + "Which is translated into" + ], + "metadata": { + "id": "aLGQxFtpnQHa" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "aPSY4htNgLVG" + } + }, + { + "cell_type": "code", + "source": [ + "similarity_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.graph.project.cypher\")\n", + " .option(\"gds.graphName\", \"similarity\")\n", + " .option(\"gds.nodeQuery\", \"\"\"\n", + " MATCH (n)\n", + " WHERE (n:Client AND n.firstPartyFraudGroup is not NULL) OR n:Email OR n:Phone OR n:SSN\n", + " RETURN id(n) AS id, labels(n) AS labels\n", + " \"\"\")\n", + " .option(\"gds.relationshipQuery\", \"\"\"\n", + " MATCH (s:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(t)\n", + " WHERE s.firstPartyFraudGroup is not NULL\n", + " RETURN id(s) AS source, id(t) AS target, 'HAS_IDENTIFIER' as type\n", + " \"\"\")\n", + " .load())\n", + "\n", + "similarity_graph_proj_df.show(truncate=False)" + ], + "metadata": { + "id": "eNvVkJTRfuqM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "PcJCKDDhmbtr" + } + }, + { + "cell_type": "code", + "source": [ + "similarity_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.graph.project.cypher\",\n", + " \"gds.graphName\": \"similarity\",\n", + " \"gds.nodeQuery\": \"\"\"\n", + " MATCH (n)\n", + " WHERE (n:Client AND n.firstPartyFraudGroup is not NULL) OR n:Email OR n:Phone OR n:SSN\n", + " RETURN id(n) AS id, labels(n) AS labels\n", + " \"\"\",\n", + " \"gds.relationshipQuery\": \"\"\"\n", + " MATCH (s:Client)-[:HAS_EMAIL|:HAS_PHONE|:HAS_SSN]->(t)\n", + " WHERE s.firstPartyFraudGroup is not NULL\n", + " RETURN id(s) AS source, id(t) AS target, 'HAS_IDENTIFIER' as type\n", + " \"\"\"\n", + "})\n", + "\n", + "similarity_graph_proj_ps" + 
], + "metadata": { + "id": "odriamHwmiEQ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Compute the node similarity\n", + "\n", + "We can mutate in-memory graph by writing outputs from the algorithm as node or relationship properties.\n", + "\n", + "In this particular case all the procedures with `mutate` and `write` suffix are not supported from the Neo4j Spark Connector, in this case we'll write a Cypher query:\n" + ], + "metadata": { + "id": "TfnVREvvmzG4" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "CALL gds.nodeSimilarity.mutate('similarity',\n", + " {\n", + " topK:15,\n", + " mutateProperty: 'jaccardScore',\n", + " mutateRelationshipType:'SIMILAR_TO'\n", + " }\n", + ");" + ], + "metadata": { + "id": "E0_WoCLlnIr6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Mutate mode is very fast compared to write mode and it helps in optimizing algorithm execution times, then we write back the property from in-memory graph to the database and use it for further analysis:" + ], + "metadata": { + "id": "JnC3C7urPyN_" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "CALL gds.graph.writeRelationship('similarity', 'SIMILAR_TO', 'jaccardScore');" + ], + "metadata": { + "id": "xajV8enLPvIN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Exercise: Calculate First-party Fraud Score\n", + "\n", + "We compute first party fraud score using weighted degree centrality algorithm.\n", + "\n", + "In this step, we compute and assign fraud score (`firstPartyFraudScore`) to clients in the clusters identified in previous steps based on `SIMILAR_TO` relationships weighted by `jaccardScore`\n", + "\n", + "Weighted degree centrality algorithm add up similarity scores (`jaccardScore`) on the incoming `SIMILAR_TO` relationships for a given node in a cluster and assign the sum as the corresponding `firstPartyFraudScore`. This score represents clients who are similar to many others in the cluster in terms of sharing identifiers. Higher `firstPartyFraudScore` represents greater potential for committing fraud." + ], + "metadata": { + "id": "qpXXoWeIQk9U" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark" + ], + "metadata": { + "id": "fAp_acV-RBOu" + } + }, + { + "cell_type": "code", + "source": [ + "# invoke the gds.degree.stream procedure" + ], + "metadata": { + "id": "ZPAHXvT6Qouy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "similarity_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.degree.stream\")\n", + " .option(\"gds.graphName\", \"similarity\")\n", + " .option(\"gds.nodeLabels\", \"['Client']\")\n", + " .option(\"gds.relationshipTypes\", \"['SIMILAR_TO']\")\n", + " .option(\"gds.relationshipWeightProperty\", \"jaccardScore\")\n", + " .load())\n", + "\n", + "# join the two dataframes and show id, score\n", + "client_similarity_df = (clients_df.join(similarity_df, clients_df[\"\"] == similarity_df[\"nodeId\"], \"inner\")\n", + " .select(\"id\", \"score\")\n", + " .withColumnRenamed(\"score\", \"firstPartyFraudScore\"))\n", + "\n", + "# write the results back to the database\n", + "(client_similarity_df.write.format('org.neo4j.spark.DataSource')\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \"Client\")\n", + " .option(\"node.keys\", \"id\")\n", + " .save())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "1qKbMW3FSwLE" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark Pandas\n", + "\n" + ], + "metadata": { + "id": "na2h6VLov-ZA" + } + }, + { + "cell_type": "code", + "source": [ + "# invoke the gds.degree.stream procedure" + ], + "metadata": { + "id": "7ObUlkdmwDzi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "similarity_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.degree.stream\",\n", + " \"gds.graphName\": \"similarity\",\n", + " \"gds.nodeLabels\": \"['Client']\",\n", + " \"gds.relationshipTypes\": \"['SIMILAR_TO']\",\n", + " \"gds.relationshipWeightProperty\": \"jaccardScore\"\n", + "})\n", + "\n", + "# join the two pandas df and show id, score\n", + "client_similarity_ps = (clients_ps.join(similarity_ps.set_index(\"nodeId\"), on=\"\")[[\"id\", \"score\"]]\n", + " .rename(columns={\"score\": \"firstPartyFraudScore\"}))\n", + "\n", + "# write the results back to the database\n", + "client_similarity_ps.spark.to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", + " \"labels\": \"Client\",\n", + " \"node.keys\": \"id\"\n", + "})\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "m8Cy4NtsS3nQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Verifiy the result\n", + "\n", + "We expect that:\n", + "- `similarity_df`/`similarity_ps`\n", + " - has two columns:\n", + " - `nodeId` of long type\n", + " - `score` of double type\n", + " - a count of **9134** rows\n", + "- `client_similarity_df`/`client_similarity_ps`\n", + " - has two columns:\n", + " - `id` of long type\n", + " - `score` of double type\n", + " - a count of 2433 rows" + ], + "metadata": { + "id": "I_TKq9TmU3cS" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Test PySpark Dataframe" + ], + "metadata": { + "id": "Dow9PWAtxBPu" + } + }, + { + "cell_type": "code", + "source": [ + "assert StructType([StructField(\"nodeId\", LongType()), StructField(\"score\", DoubleType())]) == similarity_df.schema\n", + "assert 9134 == similarity_df.count()\n", + "\n", + "assert StructType([StructField(\"id\", StringType()), StructField(\"firstPartyFraudScore\", DoubleType())]) == client_similarity_df.schema\n", + "assert 2433 == client_similarity_df.count()\n", + "print(\"All assertion are successfuly satisfied.\")" + ], + "metadata": { + "id": "5E6fUHwZU7PN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Test PySpark Pandas" + ], + "metadata": { + "id": "feuIfTCqxGKt" + } + }, + { + "cell_type": "code", + "source": [ + "assert StructType([StructField(\"nodeId\", LongType()), StructField(\"score\", DoubleType())]) == similarity_ps.to_spark().schema\n", + "assert 9134 == similarity_ps.count()[0]\n", + "\n", + "assert StructType([StructField(\"id\", StringType()), StructField(\"firstPartyFraudScore\", DoubleType())]) == client_similarity_ps.to_spark().schema\n", + "assert 2433 == client_similarity_ps.count()[0]\n", + "print(\"All assertion are successfuly satisfied.\")" + ], + "metadata": { + "id": "OHel2EYoxFpC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We find clients with first-party fraud score greater than some threshold (X) and label those top X percentile clients as fraudsters. In this example, using 95th percentile as a threshold, we set a property FirstPartyFraudster on the Client node." + ], + "metadata": { + "id": "YBhfX5DYXsom" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (c:Client)\n", + "WHERE c.firstPartyFraudScore IS NOT NULL\n", + "WITH percentileCont(c.firstPartyFraudScore, 0.95) AS firstPartyFraudThreshold\n", + "MATCH (c:Client)\n", + "WHERE c.firstPartyFraudScore > firstPartyFraudThreshold\n", + "SET c:FirstPartyFraudster" + ], + "metadata": { + "id": "bSb4rZ-aXvYV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Second-party Fraud / Money Mules\n", + "\n", + "The first step is to find out clients who weren't identified as first party fraudsters but they transact with first party fraudsters." 
+ ], + "metadata": { + "id": "WyKyntZ_YC1g" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH p=(:Client:FirstPartyFraudster)-[]-(:Transaction)-[]-(c:Client)\n", + "WHERE NOT c:FirstPartyFraudster\n", + "RETURN p\n", + "LIMIT 50" + ], + "metadata": { + "id": "C94Yg_psYTRW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Also, lets find out what types of transactions do these Clients perform with first party fraudsters" + ], + "metadata": { + "id": "Tm9WYZnwsQif" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (:Client:FirstPartyFraudster)-[]-(txn:Transaction)-[]-(c:Client)\n", + "WHERE NOT c:FirstPartyFraudster\n", + "UNWIND labels(txn) AS transactionType\n", + "RETURN transactionType, count(*) AS freq" + ], + "metadata": { + "id": "x2P9Y7IzY2Vl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Create new relationships\n", + "\n", + "Let’s go ahead and create `TRANSFER_TO` relationships between clients with `firstPartyFraudster` tags and other clients. Also add the total amount from all such transactions as a property on `TRANSFER_TO` relationships.\n", + "\n", + "Since the total amount transferred from a fraudster to a client and the total amount transferred in the reverse direction are not the same, we have to create relationships in two separate queries.\n", + "\n", + "* `TRANSFER_TO` relationship from a fraudster to a client (look at the directions in queries)\n", + "* Add `SecondPartyFraudSuspect` tag to these clients" + ], + "metadata": { + "id": "f2HTap1vuLBW" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (c1:FirstPartyFraudster)-[]->(t:Transaction)-[]->(c2:Client)\n", + "WHERE NOT c2:FirstPartyFraudster\n", + "WITH c1, c2, sum(t.amount) AS totalAmount\n", + "SET c2:SecondPartyFraudSuspect\n", + "CREATE (c1)-[:TRANSFER_TO {amount:totalAmount}]->(c2)" + ], + "metadata": { + "id": "lfGkbJk2ueau" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "* `TRANSFER_TO` relationship from a client to a fraudster." 
+ ], + "metadata": { + "id": "6yO2wJP6uksA" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH (c1:FirstPartyFraudster)<-[]-(t:Transaction)<-[]-(c2:Client)\n", + "WHERE NOT c2:FirstPartyFraudster\n", + "WITH c1, c2, sum(t.amount) AS totalAmount\n", + "SET c2:SecondPartyFraudSuspect\n", + "CREATE (c1)<-[:TRANSFER_TO {amount:totalAmount}]-(c2);" + ], + "metadata": { + "id": "WUeeP4MhujzM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Visualize newly created `TRANSFER_TO` relationships" + ], + "metadata": { + "id": "68rcS85lutUx" + } + }, + { + "cell_type": "code", + "source": [ + "%%cypher\n", + "MATCH p=(:Client:FirstPartyFraudster)-[:TRANSFER_TO]-(c:Client)\n", + "WHERE NOT c:FirstPartyFraudster\n", + "RETURN p\n", + "LIMIT 50" + ], + "metadata": { + "id": "LC5kSXOGuu1w" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Goal\n", + "\n", + "Our objective is to find out clients who may have supported the first party fraudsters and were not identified as potential first party fraudsters.\n", + "\n", + "Our hypothesis is that clients who perform transactions of type `Transfer` where they either send or receive money from first party fraudsters are flagged as suspects for second party fraud.\n", + "\n", + "To identify such clients, make use of `TRANSFER_TO` relationships and use this recipe:\n", + "\n", + "* Use **WCC** (community detection) to identify networks of clients who are connected to first party fraudsters\n", + "* Use **PageRank** (centrality) to score clients based on their influence in terms of the amount of money transferred to/from fraudsters\n", + "* Assign risk score (`secondPartyFraudScore`) to these clients" + ], + "metadata": { + "id": "6swyguhivCEL" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Exercise: Project the graph\n", + "\n", + "Let’s use native projection and create an in-memory graph with Client nodes and TRANSFER_TO relationships.\n", + "\n", + "We want to project:\n", + "* `Client` for `nodeProjection`\n", + "* `TRANSFER_TO` for `relationshipProjection`\n", + "* for the configuration we want to set `relationshipProperties` to `amount`" + ], + "metadata": { + "id": "ILntBuwxvXCz" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark" + ], + "metadata": { + "id": "zgKufo0Z2C8e" + } + }, + { + "cell_type": "code", + "source": [ + "second_party_graph_proj_df = # insert your PySpark code here" + ], + "metadata": { + "id": "JmhR2nhwvWRu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "second_party_graph_proj_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.graph.project\")\n", + " .option(\"gds.graphName\", \"SecondPartyFraudNetwork\")\n", + " .option(\"gds.nodeProjection\", \"Client\")\n", + " .option(\"gds.relationshipProjection\", \"TRANSFER_TO\")\n", + " .option(\"gds.configuration.relationshipProperties\", \"amount\")\n", + " .load())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "Ab6GEhJ_Tsrx" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark Pandas" + ], + "metadata": { + "id": "QHnI8W5J2Gcp" + } + }, + { + "cell_type": "code", + "source": [ + "second_party_graph_proj_ps = # insert your PySpark Pandas code here" + ], + "metadata": { + "id": "kD6vXaab3JdM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "second_party_graph_proj_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.graph.project\",\n", + " \"gds.graphName\": \"SecondPartyFraudNetwork\",\n", + " \"gds.nodeProjection\": \"Client\",\n", + " \"gds.relationshipProjection\": \"TRANSFER_TO\",\n", + " \"gds.configuration.relationshipProperties\": \"amount\"\n", + "})\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "c0VjV2rBTy10" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Verify the projection" + ], + "metadata": { + "id": "ZVul68Wi162b" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Test PySpark Dataframe" + ], + "metadata": { + "id": "jlOuUGA833ny" + } + }, + { + "cell_type": "code", + "source": [ + "second_party_graph_proj_df.cache()\n", + "\n", + "first_row = [\n", + " {\n", + " \"node\": list(row[\"nodeProjection\"].keys())[0],\n", + " \"rel\": list(row[\"relationshipProjection\"].keys())[0],\n", + " \"graphName\": row[\"graphName\"],\n", + " \"nodeCount\": row[\"nodeCount\"],\n", + " \"relCount\": row[\"relationshipCount\"]\n", + " } for row in second_party_graph_proj_df.collect()\n", + "][0]\n", + "\n", + "assert first_row == {\"node\": \"Client\", \"rel\": \"TRANSFER_TO\", \"graphName\": \"SecondPartyFraudNetwork\", \"nodeCount\": 2433, \"relCount\": 367}\n", + "print(\"All assertion are successfuly satisfied.\")" + ], + "metadata": { + "id": "hkjx6Nu339wW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### Test PySpark Pandas" + ], + "metadata": { + "id": "37mDMX8G4EPE" + } + }, + { + "cell_type": "code", + "source": [ + "second_party_graph_proj_ps_df = second_party_graph_proj_ps.to_spark()\n", + "\n", + "second_party_graph_proj_ps_df.cache()\n", + "\n", + "first_row = [\n", + " {\n", + " \"node\": list(row[\"nodeProjection\"].keys())[0],\n", + " \"rel\": list(row[\"relationshipProjection\"].keys())[0],\n", + " \"graphName\": row[\"graphName\"],\n", + " \"nodeCount\": row[\"nodeCount\"],\n", + " \"relCount\": row[\"relationshipCount\"]\n", + " } for row in second_party_graph_proj_ps_df.collect()\n", + "][0]\n", + "\n", + "assert first_row == {\"node\": \"Client\", \"rel\": \"TRANSFER_TO\", \"graphName\": \"SecondPartyFraudNetwork\", \"nodeCount\": 2433, \"relCount\": 367}\n", + "print(\"All assertion are successfuly satisfied.\")" + ], + "metadata": { + "id": "spF535cp4IcR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Check clusters with more than one clients\n", + "We will see if there are any clusters with more than one clients in them and if there are, then we should add a tag `secondPartyFraudGroup` to find them later using local queries." 
+ ], + "metadata": { + "id": "32FL1L324QxV" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark" + ], + "metadata": { + "id": "UljCEN-s40mc" + } + }, + { + "cell_type": "code", + "source": [ + "# invoke gds.wcc.stream on\n", + "second_party_wcc_df = (spark.read.format(\"org.neo4j.spark.DataSource\")\n", + " .option(\"gds\", \"gds.wcc.stream\")\n", + " .option(\"gds.graphName\", \"SecondPartyFraudNetwork\")\n", + " .load())\n", + "\n", + "# join the two dataframes aggregate by componentId\n", + "# and filtering for clusters with a size greater then 1\n", + "second_party_client_component_df = (clients_df.join(second_party_wcc_df, clients_df[\"\"] == second_party_wcc_df[\"nodeId\"], \"inner\")\n", + " .groupBy(\"componentId\")\n", + " .agg(count(\"*\").alias(\"count\"), collect_list(clients_df[\"id\"]).alias(\"cluster\"))\n", + " .filter(size(\"cluster\") > 1))\n", + "\n", + "second_party_client_component_df = (second_party_client_component_df\n", + " .withColumn(\"id\", explode(col(\"cluster\")))\n", + " .select(\"id\", \"componentId\"))\n", + "\n", + "\n", + "second_party_client_component_df.show(truncate=False)" + ], + "metadata": { + "id": "jKK3m8rH150-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Code in PySpark Pandas" + ], + "metadata": { + "id": "cL0e6QHDZZZi" + } + }, + { + "cell_type": "code", + "source": [ + "second_party_wcc_ps = ps.read_spark_io(format=\"org.neo4j.spark.DataSource\", options={\n", + " \"gds\": \"gds.wcc.stream\",\n", + " \"gds.graphName\": \"SecondPartyFraudNetwork\"\n", + "})\n", + "\n", + "second_party_client_component_ps = (clients_ps.join(second_party_wcc_ps.set_index(\"nodeId\"), on=\"\")\n", + " .groupby([\"componentId\"])\n", + " .id\n", + " .apply(list)\n", + " .reset_index()\n", + " .rename(columns={'id': 'cluster'})\n", + ")\n", + "\n", + "second_party_client_component_ps = second_party_client_component_ps[second_party_client_component_ps[\"cluster\"].apply(lambda a: len(a)) > 1]\n", + "\n", + "second_party_client_component_ps = (second_party_client_component_ps\n", + " .explode(\"cluster\")\n", + " .rename(columns={'cluster': 'id'})\n", + " [[\"id\", \"componentId\"]])\n", + "\n", + "second_party_client_component_ps[:20]" + ], + "metadata": { + "id": "49OxrIuTZdb-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Exercise: Write the results back to database\n", + "\n", + "Write a Spark job that given the columns (`id`, `componentId`) retrieve the `Client` by the `id` and set for the node a property `secondPartyFraudGroup` with the value of `componentId`" + ], + "metadata": { + "id": "9IsIbHD7b3cL" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark" + ], + "metadata": { + "id": "1U2kzCyWdv8l" + } + }, + { + "cell_type": "code", + "source": [ + "# write your PySpark code here" + ], + "metadata": { + "id": "PTid4oiqg_lu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "(second_party_client_component_df\n", + " .withColumnRenamed(\"componentId\", \"secondPartyFraudGroup\")\n", + " .write\n", + " .format(\"org.neo4j.spark.DataSource\")\n", + " .mode(\"Overwrite\")\n", + " .option(\"labels\", \"Client\")\n", + " .option(\"node.keys\", \"id\")\n", + " .save())\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "BQB9o9JCYuOs" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Code in PySpark Pandas" + ], + "metadata": { + "id": "QgUs-jzIdyWJ" + } + }, + { + "cell_type": "code", + "source": [ + "# write your PySpark Pandas code here" + ], + "metadata": { + "id": "Copd3BPHduvJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "
\n", + "\n", + "Show a possible solution\n", + "\n", + "\n", + "```python\n", + "(client_component_ps\n", + " .rename(columns={\"componentId\": \"secondPartyFraudGroup\"})\n", + " .spark\n", + " .to_spark_io(format=\"org.neo4j.spark.DataSource\", mode=\"Overwrite\", options={\n", + " \"labels\": \"Client\",\n", + " \"node.keys\": \"id\"\n", + " }))\n", + "```\n", + "\n", + "
" + ], + "metadata": { + "id": "7uiMcrNHY3GS" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Verify the Spark job result" + ], + "metadata": { + "id": "0mi9x4ragzyX" + } + }, + { + "cell_type": "code", + "source": [ + "secondPartyFraudGroup_check_count = %cypher -u $neo4j_url MATCH (c:Client) WHERE c.secondPartyFraudGroup IS NOT NULL RETURN count(c) AS count\n", + "assert 2433 == secondPartyFraudGroup_check_count['count'][0]\n", + "print(\"All assertion are successfuly satisfied.\")" + ], + "metadata": { + "id": "UM3TB_KxgzSh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "xSkwu_ovYaI7" + }, + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/jreleaser.yml b/jreleaser.yml index f9793cf26..d153da633 100644 --- a/jreleaser.yml +++ b/jreleaser.yml @@ -73,7 +73,7 @@ assemble: output: . includes: - '{{projectName}}-{{projectVersion}}*.jar' - templateDirectory: spark-3/src/jreleaser/assemblers/zip + templateDirectory: spark/src/jreleaser/assemblers/zip hooks: script: @@ -88,7 +88,7 @@ hooks: includes: [ "assemble" ] matrix: vars: - scala: [ "2.12", "2.13" ] + scala: [ "2.13" ] continueOnError: false verbose: true shell: BASH @@ -96,8 +96,8 @@ hooks: mkdir artifacts || true ./maven-release.sh deploy {{matrix.scala}} default::file://{{basedir}}/target/{{matrix.scala}}/maven-artifacts cp -r {{basedir}}/target/{{matrix.scala}}/maven-artifacts artifacts/ - cp -r {{basedir}}/spark-3/target/{{projectName}}*.zip artifacts/ - cp -r {{basedir}}/spark-3/target/{{projectName}}*.jar artifacts/ + cp -r {{basedir}}/spark/target/{{projectName}}*.zip artifacts/ + cp -r {{basedir}}/spark/target/{{projectName}}*.jar artifacts/ signing: active: ALWAYS diff --git a/maven-release.sh b/maven-release.sh index 6c1873744..20bf6fbcf 100755 --- a/maven-release.sh +++ b/maven-release.sh @@ -12,7 +12,7 @@ exit_script() { mv -f pom.xml.bak pom.xml mv -f common/pom.xml.bak common/pom.xml mv -f test-support/pom.xml.bak test-support/pom.xml - mv -f spark-3/pom.xml.bak spark-3/pom.xml + mv -f spark/pom.xml.bak spark/pom.xml trap - SIGINT SIGTERM # clear the trap kill -- -$$ || true # Sends SIGTERM to child/sub processes } @@ -27,7 +27,7 @@ trap exit_script SIGINT SIGTERM GOAL=$1 SCALA_VERSION=$2 -SPARK_VERSION=3 +SPARK_VERSION=4 if [[ $# -eq 3 ]] ; then ALT_DEPLOYMENT_REPOSITORY="-DaltDeploymentRepository=$3" else @@ -48,7 +48,7 @@ SPARK_PACKAGES_VERSION="${PROJECT_VERSION}-s_$SCALA_VERSION" cp pom.xml pom.xml.bak cp common/pom.xml common/pom.xml.bak cp test-support/pom.xml test-support/pom.xml.bak -cp spark-3/pom.xml spark-3/pom.xml.bak +cp spark/pom.xml spark/pom.xml.bak ./mvnw -B versions:set -DnewVersion=${PROJECT_VERSION}_for_spark_${SPARK_VERSION} -DgenerateBackupPoms=false @@ -61,11 +61,11 @@ sed_i "s/neo4j-connector-apache-spark_common<\/artifactId>/neo4j-connector-apache-spark_parent<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_parent<\/artifactId>/" "common/pom.xml" sed_i "s/neo4j-connector-apache-spark_test-support<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_test-support<\/artifactId>/" "common/pom.xml" -sed_i "s/neo4j-connector-apache-spark<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}<\/artifactId>/" "spark-3/pom.xml" -sed_i "s/neo4j-connector-apache-spark_parent<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_parent<\/artifactId>/" "spark-3/pom.xml" -sed_i 
"s/neo4j-connector-apache-spark_common<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_common<\/artifactId>/" "spark-3/pom.xml" -sed_i "s/neo4j-connector-apache-spark_test-support<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_test-support<\/artifactId>/" "spark-3/pom.xml" -sed_i "s//${SPARK_PACKAGES_VERSION}<\/spark-packages.version>/" "spark-3/pom.xml" +sed_i "s/neo4j-connector-apache-spark<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}<\/artifactId>/" "spark/pom.xml" +sed_i "s/neo4j-connector-apache-spark_parent<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_parent<\/artifactId>/" "spark/pom.xml" +sed_i "s/neo4j-connector-apache-spark_common<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_common<\/artifactId>/" "spark/pom.xml" +sed_i "s/neo4j-connector-apache-spark_test-support<\/artifactId>/neo4j-connector-apache-spark_${SCALA_VERSION}_test-support<\/artifactId>/" "spark/pom.xml" +sed_i "s//${SPARK_PACKAGES_VERSION}<\/spark-packages.version>/" "spark/pom.xml" # build ./mvnw -B clean "${GOAL}" -Dscala-"${SCALA_VERSION}" -DskipTests ${ALT_DEPLOYMENT_REPOSITORY} diff --git a/pom.xml b/pom.xml index 91b8e0620..07d300f57 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ common test-support - spark-3 + spark https://github.com/neo4j/neo4j-spark-connector @@ -68,11 +68,11 @@ 4.1.128.Final UTF-8 4.9.6 - 2.12 - 2.12.20 + 2.13 + 2.13.18 2.0.17 4.0.0 - 3.5.7 + 4.0.1 3.0.0 2.0.2 diff --git a/scripts/python/requirements.txt b/scripts/python/requirements.txt index de8c2420b..a449cc1c8 100644 --- a/scripts/python/requirements.txt +++ b/scripts/python/requirements.txt @@ -1,4 +1,4 @@ -pyspark==3.5.5 +pyspark==4.0.1 testcontainers[neo4j] six tzlocal==2.1 \ No newline at end of file diff --git a/spark-3/LICENSES.txt b/spark/LICENSES.txt similarity index 100% rename from spark-3/LICENSES.txt rename to spark/LICENSES.txt diff --git a/spark-3/NOTICE.txt b/spark/NOTICE.txt similarity index 100% rename from spark-3/NOTICE.txt rename to spark/NOTICE.txt diff --git a/spark-3/pom.xml b/spark/pom.xml similarity index 100% rename from spark-3/pom.xml rename to spark/pom.xml diff --git a/spark-3/src/jreleaser/assemblers/zip/README.txt.tpl b/spark/src/jreleaser/assemblers/zip/README.txt.tpl similarity index 100% rename from spark-3/src/jreleaser/assemblers/zip/README.txt.tpl rename to spark/src/jreleaser/assemblers/zip/README.txt.tpl diff --git a/spark-3/src/main/assemblies/spark-packages-assembly.xml b/spark/src/main/assemblies/spark-packages-assembly.xml similarity index 100% rename from spark-3/src/main/assemblies/spark-packages-assembly.xml rename to spark/src/main/assemblies/spark-packages-assembly.xml diff --git a/spark-3/src/main/distributions/spark-packages.pom b/spark/src/main/distributions/spark-packages.pom similarity index 100% rename from spark-3/src/main/distributions/spark-packages.pom rename to spark/src/main/distributions/spark-packages.pom diff --git a/spark-3/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister similarity index 100% rename from spark-3/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister rename to spark/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister diff --git a/spark-3/src/main/resources/neo4j-spark-connector.properties b/spark/src/main/resources/neo4j-spark-connector.properties similarity index 100% rename from 
spark-3/src/main/resources/neo4j-spark-connector.properties rename to spark/src/main/resources/neo4j-spark-connector.properties diff --git a/spark-3/src/main/scala/org/neo4j/spark/DataSource.scala b/spark/src/main/scala/org/neo4j/spark/DataSource.scala similarity index 98% rename from spark-3/src/main/scala/org/neo4j/spark/DataSource.scala rename to spark/src/main/scala/org/neo4j/spark/DataSource.scala index f96cd4c20..1c4222c79 100644 --- a/spark-3/src/main/scala/org/neo4j/spark/DataSource.scala +++ b/spark/src/main/scala/org/neo4j/spark/DataSource.scala @@ -39,7 +39,7 @@ import java.util.UUID class DataSource extends TableProvider with DataSourceRegister { - Validations.validate(ValidateSparkMinVersion("3.3.0")) + Validations.validate(ValidateSparkMinVersion("4.0.0")) private val jobId: String = UUID.randomUUID().toString diff --git a/spark-3/src/main/scala/org/neo4j/spark/Neo4jTable.scala b/spark/src/main/scala/org/neo4j/spark/Neo4jTable.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/Neo4jTable.scala rename to spark/src/main/scala/org/neo4j/spark/Neo4jTable.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReader.scala b/spark/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReader.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReader.scala rename to spark/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReader.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReaderFactory.scala b/spark/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReaderFactory.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReaderFactory.scala rename to spark/src/main/scala/org/neo4j/spark/reader/Neo4jPartitionReaderFactory.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jScan.scala b/spark/src/main/scala/org/neo4j/spark/reader/Neo4jScan.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jScan.scala rename to spark/src/main/scala/org/neo4j/spark/reader/Neo4jScan.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jScanBuilder.scala b/spark/src/main/scala/org/neo4j/spark/reader/Neo4jScanBuilder.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/reader/Neo4jScanBuilder.scala rename to spark/src/main/scala/org/neo4j/spark/reader/Neo4jScanBuilder.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jMicroBatchReader.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jMicroBatchReader.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jMicroBatchReader.scala rename to spark/src/main/scala/org/neo4j/spark/streaming/Neo4jMicroBatchReader.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jOffset.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jOffset.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jOffset.scala rename to spark/src/main/scala/org/neo4j/spark/streaming/Neo4jOffset.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingDataWriterFactory.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingDataWriterFactory.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingDataWriterFactory.scala rename to 
spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingDataWriterFactory.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReader.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReader.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReader.scala rename to spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReader.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReaderFactory.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReaderFactory.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReaderFactory.scala rename to spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingPartitionReaderFactory.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingWriter.scala b/spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingWriter.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingWriter.scala rename to spark/src/main/scala/org/neo4j/spark/streaming/Neo4jStreamingWriter.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jBatchWriter.scala b/spark/src/main/scala/org/neo4j/spark/writer/Neo4jBatchWriter.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jBatchWriter.scala rename to spark/src/main/scala/org/neo4j/spark/writer/Neo4jBatchWriter.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriter.scala b/spark/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriter.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriter.scala rename to spark/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriter.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriterFactory.scala b/spark/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriterFactory.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriterFactory.scala rename to spark/src/main/scala/org/neo4j/spark/writer/Neo4jDataWriterFactory.scala diff --git a/spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jWriterBuilder.scala b/spark/src/main/scala/org/neo4j/spark/writer/Neo4jWriterBuilder.scala similarity index 100% rename from spark-3/src/main/scala/org/neo4j/spark/writer/Neo4jWriterBuilder.scala rename to spark/src/main/scala/org/neo4j/spark/writer/Neo4jWriterBuilder.scala diff --git a/spark-3/src/test/java/org/neo4j/spark/DataSourceReaderTypesTSE.java b/spark/src/test/java/org/neo4j/spark/DataSourceReaderTypesTSE.java similarity index 100% rename from spark-3/src/test/java/org/neo4j/spark/DataSourceReaderTypesTSE.java rename to spark/src/test/java/org/neo4j/spark/DataSourceReaderTypesTSE.java diff --git a/spark-3/src/test/java/org/neo4j/spark/SparkConnectorSuiteIT.java b/spark/src/test/java/org/neo4j/spark/SparkConnectorSuiteIT.java similarity index 100% rename from spark-3/src/test/java/org/neo4j/spark/SparkConnectorSuiteIT.java rename to spark/src/test/java/org/neo4j/spark/SparkConnectorSuiteIT.java diff --git a/spark-3/src/test/resources/log4j2.properties b/spark/src/test/resources/log4j2.properties similarity index 100% rename from spark-3/src/test/resources/log4j2.properties rename to spark/src/test/resources/log4j2.properties diff --git a/spark-3/src/test/resources/neo4j-keycloak.jks 
b/spark/src/test/resources/neo4j-keycloak.jks similarity index 100% rename from spark-3/src/test/resources/neo4j-keycloak.jks rename to spark/src/test/resources/neo4j-keycloak.jks diff --git a/spark-3/src/test/resources/neo4j-sso-test-realm.json b/spark/src/test/resources/neo4j-sso-test-realm.json similarity index 100% rename from spark-3/src/test/resources/neo4j-sso-test-realm.json rename to spark/src/test/resources/neo4j-sso-test-realm.json diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceAggregationTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceAggregationTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceAggregationTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceAggregationTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jWithApocTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jWithApocTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jWithApocTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceReaderNeo4jWithApocTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceReaderTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceReaderTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderWithApocTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceReaderWithApocTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceReaderWithApocTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceReaderWithApocTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceSchemaWriterTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceSchemaWriterTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceSchemaWriterTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceSchemaWriterTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceStreamingReaderTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceStreamingReaderTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceStreamingReaderTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceStreamingReaderTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala similarity index 96% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala index 4f88faa1c..85c2e5e93 100644 --- a/spark-3/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala +++ b/spark/src/test/scala/org/neo4j/spark/DataSourceStreamingWriterTSE.scala @@ -260,13 +260,10 @@ class DataSourceStreamingWriterTSE extends SparkConnectorScalaBaseTSE { val checkpointLocation = "/tmp/checkpoint/" + UUID.randomUUID().toString 
SparkConnectorScalaSuiteIT.driver.session() - .executeWrite(tx => - { - tx.run("CREATE CONSTRAINT From_value FOR (p:From) REQUIRE p.value IS UNIQUE") - tx.run("CREATE CONSTRAINT To_value FOR (p:To) REQUIRE p.value IS UNIQUE") - } - .consume() - ) + .executeWrite(tx => { + tx.run("CREATE CONSTRAINT From_value FOR (p:From) REQUIRE p.value IS UNIQUE").consume() + tx.run("CREATE CONSTRAINT To_value FOR (p:To) REQUIRE p.value IS UNIQUE").consume() + }) query = memStream.toDF().writeStream .format(classOf[DataSource].getName) @@ -317,12 +314,9 @@ class DataSourceStreamingWriterTSE extends SparkConnectorScalaBaseTSE { ) SparkConnectorScalaSuiteIT.driver.session() - .executeWrite(tx => - { - tx.run("DROP CONSTRAINT From_value") - tx.run("DROP CONSTRAINT To_value") - } - .consume() - ) + .executeWrite(tx => { + tx.run("DROP CONSTRAINT From_value").consume() + tx.run("DROP CONSTRAINT To_value").consume() + }) } } diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jSkipNullKeysTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jSkipNullKeysTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jSkipNullKeysTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jSkipNullKeysTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala similarity index 96% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala index 0e6a5f8ba..24b523876 100644 --- a/spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala +++ b/spark/src/test/scala/org/neo4j/spark/DataSourceWriterNeo4jTSE.scala @@ -62,13 +62,10 @@ class DataSourceWriterNeo4jTSE extends SparkConnectorScalaBaseTSE { use(SparkConnectorScalaSuiteIT.session("db2")) { session => session - .executeWrite(tx => - { - tx.run("CREATE CONSTRAINT person_id FOR (p:Person) REQUIRE p.id IS UNIQUE") - tx.run("CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE") - } - .consume() - ) + .executeWrite(tx => { + tx.run("CREATE CONSTRAINT person_id FOR (p:Person) REQUIRE p.id IS UNIQUE").consume() + tx.run("CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE").consume() + }) } try { @@ -146,13 +143,10 @@ class DataSourceWriterNeo4jTSE extends SparkConnectorScalaBaseTSE { ) } finally { SparkConnectorScalaSuiteIT.driver.session(SessionConfig.forDatabase("db2")) - .executeWrite(tx => - { - tx.run("DROP CONSTRAINT person_id") - tx.run("DROP CONSTRAINT product_id") - } - .consume() - ) + .executeWrite(tx => { + tx.run("DROP CONSTRAINT person_id").consume() + tx.run("DROP CONSTRAINT product_id").consume() + }) } } @@ -178,13 +172,10 @@ class DataSourceWriterNeo4jTSE extends SparkConnectorScalaBaseTSE { use(SparkConnectorScalaSuiteIT.session("db2")) { session => session - .executeWrite(tx => - { - tx.run("CREATE CONSTRAINT person_id FOR (p:Person) REQUIRE p.id IS UNIQUE") - tx.run("CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE") - } - .consume() - ) + .executeWrite(tx => { + tx.run("CREATE CONSTRAINT person_id FOR (p:Person) REQUIRE p.id IS UNIQUE").consume() + tx.run("CREATE CONSTRAINT product_id FOR (p:Product) REQUIRE p.id IS UNIQUE").consume() + }) } try { @@ -259,13 +250,10 @@ class DataSourceWriterNeo4jTSE extends SparkConnectorScalaBaseTSE { ) } finally { 
SparkConnectorScalaSuiteIT.driver.session(SessionConfig.forDatabase("db2")) - .executeWrite(tx => - { - tx.run("DROP CONSTRAINT person_id") - tx.run("DROP CONSTRAINT product_id") - } - .consume() - ) + .executeWrite(tx => { + tx.run("DROP CONSTRAINT person_id").consume() + tx.run("DROP CONSTRAINT product_id").consume() + }) } } diff --git a/spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala b/spark/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala similarity index 99% rename from spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala index 206e606ce..8c3a1af89 100755 --- a/spark-3/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala +++ b/spark/src/test/scala/org/neo4j/spark/DataSourceWriterTSE.scala @@ -576,9 +576,11 @@ class DataSourceWriterTSE extends SparkConnectorScalaBaseTSE { @Test def `should throw an error because the node already exists`(): Unit = { SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("CREATE CONSTRAINT person_surname FOR (p:Person) REQUIRE p.surname IS UNIQUE")) + .executeWrite(tx => + tx.run("CREATE CONSTRAINT person_surname FOR (p:Person) REQUIRE p.surname IS UNIQUE").consume() + ) SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("CREATE (p:Person{name: 'Andrea', surname: 'Santurbano'})")) + .executeWrite(tx => tx.run("CREATE (p:Person{name: 'Andrea', surname: 'Santurbano'})").consume()) val ds = Seq(SimplePerson("Andrea", "Santurbano")).toDS() @@ -602,16 +604,18 @@ class DataSourceWriterTSE extends SparkConnectorScalaBaseTSE { } } finally { SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("DROP CONSTRAINT person_surname")) + .executeWrite(tx => tx.run("DROP CONSTRAINT person_surname").consume()) } } @Test def `should update the node that already exists`(): Unit = { SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("CREATE CONSTRAINT person_surname FOR (p:Person) REQUIRE p.surname IS UNIQUE")) + .executeWrite(tx => + tx.run("CREATE CONSTRAINT person_surname FOR (p:Person) REQUIRE p.surname IS UNIQUE").consume() + ) SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("CREATE (p:Person{name: 'Federico', surname: 'Santurbano'})")) + .executeWrite(tx => tx.run("CREATE (p:Person{name: 'Federico', surname: 'Santurbano'})").consume()) val ds = Seq(SimplePerson("Andrea", "Santurbano")).toDS() @@ -635,7 +639,7 @@ class DataSourceWriterTSE extends SparkConnectorScalaBaseTSE { assertEquals("Andrea", nodeList.head.get("n").asNode().get("name").asString()) SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("DROP CONSTRAINT person_surname")) + .executeWrite(tx => tx.run("DROP CONSTRAINT person_surname").consume()) } @Test @@ -766,7 +770,9 @@ class DataSourceWriterTSE extends SparkConnectorScalaBaseTSE { @Test def `should handle unusual column names`(): Unit = { SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("CREATE CONSTRAINT instrument_name FOR (i:Instrument) REQUIRE i.name IS UNIQUE")) + .executeWrite(tx => + tx.run("CREATE CONSTRAINT instrument_name FOR (i:Instrument) REQUIRE i.name IS UNIQUE").consume() + ) val musicDf = Seq( (12, "John Bonham", "Drums", "f``````oo"), @@ -791,7 +797,7 @@ class DataSourceWriterTSE extends SparkConnectorScalaBaseTSE { .save() SparkConnectorScalaSuiteIT.session() - .executeWrite(tx => tx.run("DROP CONSTRAINT instrument_name")) + .executeWrite(tx => tx.run("DROP CONSTRAINT instrument_name").consume()) val musicDfCheck = 
ss.read.format(classOf[DataSource].getName) .option("url", SparkConnectorScalaSuiteIT.server.getBoltUrl) diff --git a/spark-3/src/test/scala/org/neo4j/spark/DefaultConfigTSE.scala b/spark/src/test/scala/org/neo4j/spark/DefaultConfigTSE.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/DefaultConfigTSE.scala rename to spark/src/test/scala/org/neo4j/spark/DefaultConfigTSE.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/GraphDataScienceIT.scala b/spark/src/test/scala/org/neo4j/spark/GraphDataScienceIT.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/GraphDataScienceIT.scala rename to spark/src/test/scala/org/neo4j/spark/GraphDataScienceIT.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala b/spark/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala similarity index 99% rename from spark-3/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala rename to spark/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala index ee579b3b4..a87ec3a39 100644 --- a/spark-3/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala +++ b/spark/src/test/scala/org/neo4j/spark/ReauthenticationIT.scala @@ -19,6 +19,7 @@ package org.neo4j.spark import org.junit.AfterClass import org.junit.Assert.assertEquals import org.junit.BeforeClass +import org.junit.Ignore import org.junit.Test import org.neo4j.Neo4jContainerExtension import org.neo4j.driver.AuthTokens @@ -112,6 +113,7 @@ object ReauthenticationIT { class ReauthenticationIT extends SparkConnectorScalaSuiteIT { @Test + @Ignore("Ignored temporarily") def createAnInstanceOfReAuthDriver(): Unit = { val options = Map( "url" -> NEO4J.getBoltUrl, diff --git a/spark-3/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteIT.scala b/spark/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteIT.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteIT.scala rename to spark/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteIT.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteWithApocIT.scala b/spark/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteWithApocIT.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteWithApocIT.scala rename to spark/src/test/scala/org/neo4j/spark/SparkConnector30ScalaSuiteWithApocIT.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/SparkConnectorAuraTest.scala b/spark/src/test/scala/org/neo4j/spark/SparkConnectorAuraTest.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/SparkConnectorAuraTest.scala rename to spark/src/test/scala/org/neo4j/spark/SparkConnectorAuraTest.scala diff --git a/spark-3/src/test/scala/org/neo4j/spark/TransactionTimeoutIT.scala b/spark/src/test/scala/org/neo4j/spark/TransactionTimeoutIT.scala similarity index 100% rename from spark-3/src/test/scala/org/neo4j/spark/TransactionTimeoutIT.scala rename to spark/src/test/scala/org/neo4j/spark/TransactionTimeoutIT.scala
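
Note: the hunks above repeatedly replace a single `.consume()` applied to a block result with a `.consume()` per `tx.run(...)`. Below is a minimal sketch of that managed-transaction pattern, not part of the patch itself; it assumes the Neo4j Java driver's `Session.executeWrite`, and the URL and credentials are placeholders for illustration only.

```scala
import org.neo4j.driver.{AuthTokens, GraphDatabase}

// Illustrative sketch of the per-statement consume pattern used in the patch.
object ConsumePatternSketch extends App {
  // Placeholder connection details; not taken from the patch.
  val driver = GraphDatabase.driver("neo4j://localhost:7687", AuthTokens.basic("neo4j", "password"))
  val session = driver.session()
  try {
    session.executeWrite(tx => {
      // Consume each Result so every statement is fully executed before the
      // transaction callback returns. In the pre-patch shape, calling
      // .consume() on the block only consumed the last statement's Result.
      tx.run("CREATE CONSTRAINT person_id IF NOT EXISTS FOR (p:Person) REQUIRE p.id IS UNIQUE").consume()
      tx.run("CREATE CONSTRAINT product_id IF NOT EXISTS FOR (p:Product) REQUIRE p.id IS UNIQUE").consume()
    })
  } finally {
    session.close()
    driver.close()
  }
}
```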