Training GDS First Party Fraud Detection - tomgeudens/practical-neo4j GitHub Wiki
Context: Cut-and-paste commands for the Graph Data Science - First Party Fraud session.
Prerequisite: This document will assume you have a Neo4j instance running and are connected to it with the Neo4j Browser. You also need to have the Payment Simulation database loaded.
Quick overview
001
// Make sure you're in the correct database (paysim)
:use paysim
002
// Visualize the model
CALL db.schema.visualization();
Get to know the data
003
// Full graph counts
CALL apoc.meta.stats();
004
// Node (label) frequencies
CALL db.labels() YIELD label
CALL apoc.cypher.run('MATCH (:`'+label+'`) RETURN count(*) as freq', {})
YIELD value
WITH label, value.freq AS freq
CALL apoc.meta.stats() YIELD nodeCount
WITH *, 3 AS precision
WITH *, 10^precision AS factor, toFloat(freq)/toFloat(nodeCount) AS relFreq
RETURN label AS nodeLabel, freq AS frequency,
round(relFreq*factor)/factor AS relativeFrequency
ORDER BY freq DESC;
005
// Relationship (type) frequencies
CALL db.relationshipTypes() YIELD relationshipType as type
CALL apoc.cypher.run('MATCH ()-[:`'+type+'`]->() RETURN count(*) as freq', {})
YIELD value
WITH type AS relationshipType, value.freq AS freq
CALL apoc.meta.stats() YIELD relCount
WITH *, 3 AS precision
WITH *, 10^precision AS factor, toFloat(freq)/toFloat(relCount) as relFreq
RETURN relationshipType, freq AS frequency,
round(relFreq*factor)/factor AS relativeFrequency
ORDER BY freq DESC;
006
// Property types
CALL apoc.meta.data() YIELD label, property, type, elementType
WHERE type <> 'RELATIONSHIP'
RETURN elementType,label, property, type
ORDER BY elementType, label, property;
007
// Value Proposition
MATCH (t:Transaction)
WITH sum(t.amount) AS globalSum, count(t) AS globalCnt
WITH *, 10^3 AS scaleFactor
UNWIND ['CashIn', 'CashOut', 'Payment', 'Debit', 'Transfer'] AS txType
CALL apoc.cypher.run('MATCH (t:' + txType + ') RETURN sum(t.amount) as txAmount, count(t) AS txCnt', {})
YIELD value
RETURN txType,value.txAmount AS TotalMarketValue,
100*round(scaleFactor*(toFloat(value.txAmount)/toFloat(globalSum))) /scaleFactor AS `%MarketValue`,
100*round(scaleFactor*(toFloat(value.txCnt)/toFloat(globalCnt))) /scaleFactor AS `%MarketTransactions`,
toInteger(toFloat(value.txAmount)/toFloat(value.txCnt)) AS AvgTransactionValue,
value.txCnt AS NumberOfTransactions
ORDER BY `%MarketTransactions` DESC;
Reshaping the graph
008
// Identify pairs of clients sharing PII
MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
WHERE c1.id <> c2.id
RETURN c1.id, c2.id, count(*) AS freq
ORDER BY freq DESC;
009
// How many potential "clients with issues" do we have?
MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->()<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
WHERE c1.id <> c2.id
RETURN count(DISTINCT c1.id) AS potential;
010
// Create the SHARED_IDENTIFIERS relationship
CALL apoc.periodic.iterate(
"MATCH (c1:Client)
RETURN c1.id as c1id;",
"MATCH (c1:Client)-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(n)<-[:HAS_EMAIL|HAS_PHONE|HAS_SSN]-(c2:Client)
WHERE c1.id = c1id
AND c1.id <> c2.id
WITH c1, c2, count(*) as cnt
MERGE (c1)-[:SHARED_IDENTIFIERS {count: cnt}]->(c2);",
{batchSize:2000,parallel:false});
011
// Visualize the newly created relationship
MATCH p=(c:Client)-[s:SHARED_IDENTIFIERS]-()
WHERE s.count >= 2
RETURN p limit 25;
Finding likely targets
012
// Memory estimation of our projection
CALL gds.graph.create.cypher.estimate(
'MATCH (c:Client) RETURN id(c) AS id',
'MATCH (c1:Client)-[r:SHARED_IDENTIFIERS]->(c2:Client)
RETURN id(c1) AS source, id(c2) AS target, r.count AS weight')
YIELD requiredMemory,nodeCount,relationshipCount;
013
// Find relics (if any)
CALL gds.graph.list();
014
// Remove relics (if any)
CALL gds.graph.drop('name-of-your-relic-graph');
015
// Project the graph into the workspace
CALL gds.graph.create('WCC',
'Client',
{
SHARED_IDENTIFIERS:{
type: 'SHARED_IDENTIFIERS',
properties: {
count: {
property: 'count'
}
}
}
}
) YIELD graphName,nodeCount,relationshipCount,createMillis;
016
// Estimate algorithm memory usage
CALL gds.wcc.stream.estimate('WCC');
017
// Do the statistics make sense?
CALL gds.wcc.stats('WCC');
018
// Run in streaming mode
CALL gds.wcc.stream('WCC')
YIELD componentId, nodeId
WITH componentId AS cluster, gds.util.asNode(nodeId) AS client
WITH cluster, collect(client.id) AS clients
WITH *, size(clients) AS clusterSize
WHERE clusterSize > 1
RETURN cluster, clusterSize,clients ORDER by clusterSize DESC;
019
// Write back to the database (in streaming mode???)
CALL gds.wcc.stream('WCC')
YIELD componentId, nodeId
WITH componentId AS cluster, gds.util.asNode(nodeId) AS client
WITH cluster, collect(client.id) AS clients
WITH *, size(clients) AS clusterSize
WHERE clusterSize > 1
UNWIND clients AS client
MATCH (c:Client)
WHERE c.id = client
SET c.firstPartyFraudGroup=cluster;
020
// Visualize the results
MATCH (c:Client)
WITH c.firstPartyFraudGroup AS fpGroupID, collect(c.id) AS fGroup
WITH *, size(fGroup) AS groupSize WHERE groupSize >= 9
WITH collect(fpGroupID) AS fraudRings
MATCH p=(c:Client)-[:HAS_SSN|HAS_EMAIL|HAS_PHONE]->()
WHERE c.firstPartyFraudGroup IN fraudRings
RETURN p;
Duck Tales
021
// Project the bipartite graph of "just" our targets
CALL gds.graph.create.cypher('Similarity',
'MATCH(c:Client)
WHERE c.firstPartyFraudGroup IS NOT NULL
RETURN id(c) AS id,labels(c) AS labels
UNION
MATCH(n)
WHERE n:Email OR n:Phone OR n:SSN
RETURN id(n) AS id,labels(n) AS labels',
'MATCH(c:Client)-[r:HAS_EMAIL|HAS_PHONE|HAS_SSN]->(ids)
WHERE c.firstPartyFraudGroup IS NOT NULL
RETURN id(c) AS source,id(ids) AS target, type(r) as type')
YIELD graphName,nodeCount,relationshipCount,createMillis;
022
// Compute similarity and mutate the projection!
CALL gds.nodeSimilarity.mutate('Similarity',{topK:15,
mutateProperty:'jaccardScore', mutateRelationshipType:'SIMILAR_TO'});
023
// Verify the mutation of the projection
CALL gds.graph.list('Similarity');
The man in the middle
024
// Run degree centrality in streaming mode
CALL gds.degree.stream('Similarity',
{
nodeLabels: ['Client'],
relationshipTypes: ['SIMILAR_TO'],
relationshipWeightProperty:'jaccardScore'
}) YIELD nodeId,score
RETURN gds.util.asNode(nodeId).id AS client,score
ORDER BY score DESC;
025
// Write back to the graph
CALL gds.degree.write('Similarity',{nodeLabels:['Client'],
relationshipTypes:['SIMILAR_TO'],
relationshipWeightProperty:'jaccardScore',
writeProperty:'firstPartyFraudScore'});
026
// Enhance the graph
MATCH(c:Client)
WHERE exists(c.firstPartyFraudScore)
WITH percentileCont(c.firstPartyFraudScore, 0.95)
AS firstPartyFraudThreshold
MATCH(c:Client)
WHERE c.firstPartyFraudScore>firstPartyFraudThreshold
SET c:FirstPartyFraudster;
Cleanup
027
// Cleanup
CALL gds.graph.list()
YIELD graphName AS namedGraph
WITH namedGraph
CALL gds.graph.drop(namedGraph)
YIELD graphName
RETURN graphName;