Training GDS First Party Fraud Detection - tomgeudens/practical-neo4j GitHub Wiki

Context: Cut-and-paste commands for the Graph Data Science - First Party Fraud session.

Prerequisite: This document will assume you have a Neo4j instance running and are connected to it with the Neo4j Browser. You also need to have the Payment Simulation database loaded.

Quick overview

001

// Make sure you're in the correct database (paysim)
:use paysim

002

// Visualize the model
CALL db.schema.visualization();

Get to know the data

003

// Full graph counts
CALL apoc.meta.stats();

004

// Node (label) frequencies
CALL db.labels() YIELD label
CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as freq', {})
YIELD value
WITH label, value.freq AS freq
CALL apoc.meta.stats() YIELD nodeCount
WITH *, 3 AS precision
WITH *, 10^precision AS factor, toFloat(freq)/toFloat(nodeCount) AS relFreq
RETURN label AS nodeLabel, freq AS frequency,
round(relFreq*factor)/factor AS relativeFrequency
ORDER BY freq DESC;

005

// Relationship (type) frequencies
CALL db.relationshipTypes() YIELD relationshipType as type
CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as freq', {})
YIELD value
WITH type AS relationshipType, value.freq AS freq
CALL apoc.meta.stats() YIELD relCount
WITH *, 3 AS precision
WITH *, 10^precision AS factor, toFloat(freq)/toFloat(relCount) as relFreq
RETURN relationshipType, freq AS frequency,
round(relFreq*factor)/factor AS relativeFrequency
ORDER BY freq DESC;

006

// Property types
CALL apoc.meta.data() YIELD label, property, type, elementType
WHERE type <> 'RELATIONSHIP'
RETURN elementType,label, property, type
ORDER BY elementType, label, property;

007

// Value Proposition
MATCH (t:Transaction)
WITH sum(t.amount) AS globalSum, count(t) AS globalCnt
WITH *, 10^3 AS scaleFactor
UNWIND ['CashIn', 'CashOut', 'Payment', 'Debit', 'Transfer'] AS txType
CALL apoc.cypher.run('MATCH (t:' + txType + ') RETURN sum(t.amount) as txAmount, count(t) AS txCnt', {})
YIELD value
RETURN txType,value.txAmount AS TotalMarketValue,
  100*round(scaleFactor*(toFloat(value.txAmount)/toFloat(globalSum))) /scaleFactor AS `%MarketValue`,
  100*round(scaleFactor*(toFloat(value.txCnt)/toFloat(globalCnt))) /scaleFactor AS `%MarketTransactions`,
  toInteger(toFloat(value.txAmount)/toFloat(value.txCnt)) AS AvgTransactionValue,
  value.txCnt AS NumberOfTransactions
ORDER BY `%MarketTransactions` DESC;

Reshaping the graph

008

// Identify pairs of clients sharing PII
MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
WHERE c1.id <> c2.id
RETURN c1.id, c2.id, count(*) AS freq
ORDER BY freq DESC;

009

// How many potential "clients with issues" do we have?
MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->()<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
WHERE c1.id <> c2.id
RETURN count(DISTINCT c1.id) AS potential;

010

// Create the SHARED_IDENTIFIERS relationship
CALL apoc.periodic.iterate(
"MATCH (c1:Client)
 RETURN c1.id as c1id;",
"MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
 WHERE c1.id = c1id 
 AND c1.id <> c2.id
 WITH c1, c2, count(*) as cnt
 MERGE (c1)-[:SHARED_IDENTIFIERS {count: cnt}]->(c2);",
{batchSize:2000,parallel:false});

011

// Visualize the newly created relationship
MATCH p=(c:Client)-[s:SHARED_IDENTIFIERS]-() 
WHERE s.count >= 2 
RETURN p limit 25;

Finding likely targets

012

// Memory estimation of our projection
CALL gds.graph.create.cypher.estimate(
'MATCH (c:Client) RETURN id(c) AS id',
'MATCH (c1:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)
 RETURN id(c1) AS source, id(c2) AS target, r.count AS weight')
YIELD requiredMemory,nodeCount,relationshipCount;

013

// Find relics (if any)
CALL gds.graph.list();

014

// Remove relics (if any)
CALL gds.graph.drop('name-of-your-relic-graph');

015

// Project the graph into the workspace
CALL gds.graph.create('WCC', 
'Client',
{
    SHARED_IDENTIFIERS:{
        type: 'SHARED_IDENTIFIERS',
        properties: {
            count: {
                property: 'count'
            }
        }
    }
}
) YIELD graphName,nodeCount,relationshipCount,createMillis;

016

// Estimate algorithm memory usage
CALL gds.wcc.stream.estimate('WCC');

017

// Do the statistics make sense?
CALL gds.wcc.stats('WCC');

018

// Run in streaming mode
CALL gds.wcc.stream('WCC')
YIELD componentId, nodeId
WITH componentId AS cluster, gds.util.asNode(nodeId) AS client
WITH cluster, collect(client.id) AS clients
WITH *, size(clients) AS clusterSize
WHERE clusterSize > 1
RETURN cluster, clusterSize,clients ORDER by clusterSize DESC;

019

// Write back to the database (in streaming mode???)
CALL gds.wcc.stream('WCC')
YIELD componentId, nodeId
WITH componentId AS cluster, gds.util.asNode(nodeId) AS client
WITH cluster, collect(client.id) AS clients
WITH *, size(clients) AS clusterSize
WHERE clusterSize > 1
UNWIND clients AS client
MATCH (c:Client)
WHERE c.id = client
SET c.firstPartyFraudGroup=cluster;

020

// Visualize the results
MATCH (c:Client)
WITH c.firstPartyFraudGroup AS fpGroupID, collect(c.id) AS fGroup
WITH *, size(fGroup) AS groupSize WHERE groupSize >= 9
WITH collect(fpGroupID) AS fraudRings
MATCH p=(c:Client)-[:HAS_SSN|HAS_EMAIL|HAS_PHONE]->()
WHERE c.firstPartyFraudGroup IN fraudRings
RETURN p;

Duck Tales

021

// Project the bipartite graph of "just" our targets
CALL gds.graph.create.cypher('Similarity',
'MATCH(c:Client)
 WHERE c.firstPartyFraudGroup IS NOT NULL
 RETURN id(c) AS id,labels(c) AS labels
 UNION
 MATCH(n)
 WHERE n:Email OR n:Phone OR n:SSN
 RETURN id(n) AS id,labels(n) AS labels',
'MATCH(c:Client)-[r:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(ids)
 WHERE c.firstPartyFraudGroup IS NOT NULL
 RETURN id(c) AS source,id(ids) AS target, type(r) as type')
 YIELD graphName,nodeCount,relationshipCount,createMillis;

022

// Compute similarity and mutate the projection!
CALL gds.nodeSimilarity.mutate('Similarity',{topK:15,
  mutateProperty:'jaccardScore', mutateRelationshipType:'SIMILAR_TO'});

023

// Verify the mutation of the projection
CALL gds.graph.list('Similarity');

The man in the middle

024

// Run degree centrality in streaming mode
CALL gds.degree.stream('Similarity',
{
    nodeLabels: ['Client'],
    relationshipTypes: ['SIMILAR_TO'],
    relationshipWeightProperty:'jaccardScore'
}) YIELD nodeId,score
RETURN gds.util.asNode(nodeId).id AS client,score
ORDER BY score DESC;

025

// Write back to the graph 
CALL gds.degree.write('Similarity',{nodeLabels:['Client'],
    relationshipTypes:['SIMILAR_TO'],
    relationshipWeightProperty:'jaccardScore',
    writeProperty:'firstPartyFraudScore'});

026

// Enhance the graph
MATCH(c:Client)
WHERE exists(c.firstPartyFraudScore)
WITH percentileCont(c.firstPartyFraudScore, 0.95)
    AS firstPartyFraudThreshold
MATCH(c:Client)
WHERE c.firstPartyFraudScore>firstPartyFraudThreshold
SET c:FirstPartyFraudster;

Cleanup

027

// Cleanup
CALL gds.graph.list()
YIELD graphName AS namedGraph
WITH namedGraph
CALL gds.graph.drop(namedGraph)
YIELD graphName
RETURN graphName;