SlideShare una empresa de Scribd logo
1 de 57
Descargar para leer sin conexión
TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

Ian Barber
ianb@php.net
phpir.com
twitter.com/ianbarber
http://joind.in/3429




https://github.com/ianbarber/FindingFraudsters-Talk
5%
           3%
SOME      .1%
SMALL
NUMBERS    8%
99%
ACCURACY
REALLY     REALLY
             LEGITIMATE   FRAUD


EVALUATED
                989         0
LEGITIMATE


EVALUATED
                 10         1
  FRAUD
REALLY     REALLY
             LEGITIMATE   FRAUD



      90%
EVALUATED
LEGITIMATE
          WRONG 989         0



EVALUATED
                 10         1
  FRAUD
ANOMALY DETECTION
30




         22.5
Clicks




          15




          7.5




           0
                Date
SOFTWARE
ARCHITECTURE
                           Alarm

               Detector

                          No Alarm
                Buffer


User Clicks    Landing
    Ad          Page
DETECTOR
              statistics

 Expected
  Clicks
              Threshold    Data Buffer
Sensitivity



               Alarm
average.php
/**
 * Moving-average click detector.
 *
 * Reads "fraudclicks.csv" row by row and uses column 1 of every row as that
 * row's click count. Once more than seven earlier rows are buffered, the
 * oldest is dropped and the current count is compared against the mean of the
 * remaining seven plus $sen standard deviations; a count above that threshold
 * raises an alarm.
 *
 * Rows numbered above 201 are handled specially (presumably this is where the
 * fraudulent traffic starts in the sample data - confirm against the CSV):
 * the first alarm there ends the scan, and every such row that does not alarm
 * adds one to the "days to detect" counter.
 *
 * @param float $sen Sensitivity: number of standard deviations above the
 *                   window mean a count must exceed to raise an alarm.
 * @return array array(number of alarms minus one, number of non-alarming
 *               rows seen after row 201).
 * @throws RuntimeException If fraudclicks.csv cannot be opened.
 */
function detect($sen) {
  $windowSize = 7;
  $window = array(); $i = 0;
  $alarmCount = 0; $dtd = 0;

  $fraud = fopen("fraudclicks.csv", 'r');
  if($fraud === false) {
    throw new RuntimeException("Unable to open fraudclicks.csv");
  }

  while($d = fgetcsv($fraud)) {
    $i++;
    if(count($window) > $windowSize) {
      // Drop the oldest entry so exactly $windowSize earlier rows remain.
      array_shift($window);
      $avg = array_sum($window) / $windowSize;

      // The sum of squares starts from zero for every row; carrying the
      // previous row's deviation over would inflate the threshold.
      $sumSquares = 0;
      foreach($window as $val) {
        $sumSquares += pow($val - $avg, 2);
      }
      $stddev = sqrt($sumSquares / $windowSize);

      if($d[1] > ($avg + ($sen * $stddev))) {
        $alarmCount++;
        if($i > 201) {
          break;
        }
      } else {
        if($i > 201) {
          $dtd++;
        }
      }
    }
    // The current row only joins the window after it has been evaluated.
    array_push($window, $d[1]);
  }
  fclose($fraud);

  // One alarm is subtracted before returning, presumably so that the final
  // (genuine) detection is not counted as a false alarm.
  return array($alarmCount-1, $dtd);
}
1.6 SENSITIVITY
          30
                18 False Alarms          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                  Date
2.7 SENSITIVITY
          30
                1 False Alarm      18 Days To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SICKNESS
AVAILABILITY
// Sickness/availability detector; the slide labels this listing sickavail.php.
// NOTE(review): the listing is truncated in this source - the closing braces
// of the while loop and of the function, the return statement, and the code
// that advances $i and fills $window / $avail are not shown here.
// $sens is used below as a probability threshold for raising an alarm.
function detect($sens) {          sickavail.php
  $i = 0; $alarms = 0; $dtd = 0;
  $window = array(); $avail = array();
  $fraud = fopen("fraudclicks.csv", 'r');
  while($dat = fgetcsv($fraud)) {
    // Day of week ("0" to "6") derived from the date in column 0.
    $dow = date("w", strtotime($dat[0]));
    // Evaluate only once the window holds at least 7 entries and $avail
    // already has an entry for this day of week.
    if( count($window) >= 7
        && isset($avail[$dow]) ) {

      // "Sickness": the mean ratio between each window value and the
      // average of the $avail entry stored under the same key.
      // NOTE(review): presumably $window is keyed by day of week - confirm.
      $sick = 0;
      foreach($window as $day => $value) {
        $dowavg = array_sum($avail[$day]) /
                  count($avail[$day]);
        $sick += $value / $dowavg;
      }
      $sick /= count($window);
// "Availability": the average of the values stored for the current day of week.
$avlblty = array_sum($avail[$dow]) /
           count($avail[$dow]);
  // Expected count for this row: sickness scaled by availability.
  $est = $sick * $avlblty;

  // fac() is defined elsewhere; presumably the factorial of the count.
  $fac = fac($dat[1]);
  // Poisson probability of the count in column 1 given mean $est.
  $p = exp(-$est) * pow($est,$dat[1])
       / $fac; // poisson calc

  // Alarm only when the count is improbable AND higher than expected.
  if($p < $sens && $dat[1] > $est) {
    $alarms++;
    if($i > 201) { break; }
  } else {
    if($i > 201) { $dtd++; }
  }

} // end if
0.2




0.15




 0.1




0.05




  0
       1   2   3   4   5   6   7   8   9   10
0.011 SENSITIVITY
          30
                1 False Alarm          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SUPERVISED CLASSIFIERS
classification model
SOFTWARE
ARCHITECTURE
                               Fraud

            Classifier

                             Not Fraud
  User     Transaction
Purchase    Processor


           Transaction
                              Learner
            Database
EVALUATING THE CLASSIFIER

Training Data   Learner      Model




 Test Data
                            Prediction
                Classifier   Accuracy
   Model
20




15




10




5




0
     0   5   10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
// Labelled example transactions: 'fraud' is the known class, the remaining
// fields are the features that get indexed as text.
$docs = array(
  array(
    'fraud' => false,
    'price' => 1699,
    'desc'  => 'toy ninja',
    'ship'  => 'US',
  ),
  array(
    'fraud' => false,
    'price' => 20000,
    'desc'  => 'TV',
    'ship'  => 'US',
  ),
  array(
    'fraud' => false,
    'price' => 2500,
    'desc'  => 'cds',
    'ship'  => 'US',
  ),
  array(
    'fraud' => true,
    'price' => 20000,
    'desc'  => 'console',
    'ship'  => 'CN',
  ),
  array(
    'fraud' => true,
    'price' => 5000,
    'desc'  => 'books',
    'ship'  => 'US',
  ),
  array(
    'fraud' => true,
    'price' => 15000,
    'desc'  => 'ipod',
    'ship'  => 'CN',
  ),
);

// Open (or create) the Xapian database "index" and set up an
// English-stemming term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Store one Xapian document per transaction: the class label is the document
// data, and price, description and shipping country are the indexed text.
foreach($docs as $position => $transaction) {
    if($transaction['fraud']) {
        $label = "fraud";
    } else {
        $label = "clean";
    }

    $document = new XapianDocument();
    $document->set_data($label);
    $idx->set_document($document);

    $text = implode(' ', array(
        $transaction['price'],
        $transaction['desc'],
        $transaction['ship'],
    ));
    $idx->index_text($text);

    // NOTE(review): the array position is passed as a second argument here;
    // confirm how the Xapian binding in use treats it.
    $db->add_document($document, $position);
}

// Drop the reference to the database handle.
$db = null;
                               fraudknn.php
// testknn.php
// Classify one unlabelled transaction by looking at the stored documents that
// are most similar to it in the Xapian index.

// The transaction to classify; same fields as the indexed examples, without
// a 'fraud' label.
$test = array(
   'price' => 10000, 'desc' => 'TV',
   'ship' => 'CN'
);

$db   = new XapianWritableDatabase("index",
         Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Index the test transaction as a temporary document so that it can be used
// as the relevance set below; it is deleted again at the end of the script.
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text($test['price'] . ' ' .
      $test['desc'] . ' ' . $test['ship']);
$id = $db->add_document($xdoc);

// Request an expand set of up to 10 terms, using the temporary document as
// the only member of the relevance set.
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);
$terms = array();
$i = $eset->begin();
while ( !$i->equals($eset->end()) ) {
  $terms[] = $i->get_term(); $i->next();
}

// OR the collected terms together and fetch up to 4 matches starting at 0.
$q = new XapianQuery(
         XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);

// Print the stored data ("fraud" or "clean" when the index was built by the
// indexing script) of every match other than the temporary document itself.
$i = $matches->begin();
while (!$i->equals($matches->end())) {
  if($i->get_document()->get_docid() != $id)
  {
    $class = $i->get_document()->get_data();
    var_dump($class);
  }
  $i->next();
}

// Remove the temporary document so the index holds only labelled examples.
$db->delete_document($id);


$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"
TRANSACTION
PARAMETERS
/**
 * Score how closely an email address resembles a customer's name
 * (labelled email.php on the slide).
 *
 * Both inputs are lower-cased. The address is split at "@" into a user part
 * (with "." and "+" turned into spaces) and a domain part (with everything
 * from the first "." onwards removed). Similarity is measured with
 * similar_text() percentages.
 *
 * @param string $name  Customer name; individual words are split on spaces.
 * @param string $email Email address. An address without "@" is treated as
 *                      a user part with an empty domain.
 * @return float 1.0 if the whole name is more than 80% similar to the user
 *               part; 0.8 if it is more than 80% similar to the domain;
 *               otherwise (1.7 * best / 100) - 1, where best is the highest
 *               similarity above 50% between any single word of the name and
 *               the user part or domain (0 if none), so -1.0 means no match.
 */
function compareEmailToName($name, $email) {
  $name = strtolower($name);
  $email = strtolower($email);
  $parts = explode(" ", $name);
  $pcnt = 0;

  // Pad to two elements so an address without "@" yields an empty domain
  // instead of an undefined offset.
  list($user, $dom) = array_pad(explode("@", $email), 2, "");
  $user = str_replace(
              array(".", "+"), " ", $user);
  // The dot must be escaped: an unescaped "." matches any character and
  // would wipe out the entire domain.
  $dom = preg_replace('/\..*/', "", $dom);

  similar_text($name, $user, $pcnt);
  if($pcnt > 80) { return 1.0; }
  similar_text($name, $dom, $pcnt);
  if($pcnt > 80) { return 0.8; }

  // explode() always returns at least one element, so the loop always runs;
  // with no score above 50% $highest stays 0 and the result is -1.0.
  $highest = 0;
  foreach($parts as $part) {
    similar_text($user, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
    similar_text($dom, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
  }

  return (1.7 * ($highest/100)) - 1;
}
// Example of the fields recorded for a single transaction.
// NOTE(review): units and encodings are not shown in this listing - e.g.
// purchase_value is presumably in the smallest currency unit and the *_matches_*
// / *_provider / free_shipping fields look like 0/1 flags; confirm against the
// data source. 'fraud' is presumably the known class label for the record.
$data = array(
  'purchase_value' => 20993,
  'geo_country' => 'DE',
  'previous_orders' => 1,
  'time' => 6,
  'timegap' => 146632,
  'product_category' => 'small_item',
  'delivery_matches_card' => 0,
  'geo_ip_matches_card' => 1,
  'difference_from_last_trans' => 8755,
  'free_shipping' => 0,
  'email_like_name' => 0,
  'free_email_provider' => 0,
  'disposable_email_provider' => 0,
  'quantity' => 2,
  'fraud' => 0);
SUPPORT
VECTOR MACHINES
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
$ apt-get install libsvm-dev
$ apt-get install libsvm-tools

$ yum install libsvm-devel

$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train
(array(array(-1, -1), array(1, 1))); echo
$m->predict(array(0, -1));'
-1
// learn.php
// Load the transactions from paydata.csv and turn every row into one training
// sample for the SVM: element 0 is the class label, the numbered elements are
// features scaled or mapped into roughly the -1.0 .. 1.0 range.
$fh = fopen('paydata.csv', 'r');
if($fh === false) {
  throw new RuntimeException("Unable to open paydata.csv");
}
$output = array();

while($data = fgetcsv($fh)) {
  $output[] = array(
     // Class label: -1 when column 14 is 1, otherwise +1.
     // NOTE(review): column 14 is presumably the fraud flag - confirm
     // against the CSV layout.
     $data[14] == 1 ? -1 : 1,
     1 => ($data[0]/20000.00) - 1.0, // price
     2 => $data[1] == 'CN' ? 1.0:-1.0,
     3 => $data[1] == 'US' ? 1.0:-1.0,
     4 => $data[5] == 'digital' ? 1.0:-1.0,
     5 => $data[7] == 1 ? 1.0:-1.0, //geo
     6 => $data[6] == 1 ? 1.0:-1.0, // deliv
     // NOTE(review): feature numbers 7-11 do not appear in this listing.
     12 => $data[9] == 1 ? 1.0:-1.0, // ship
     13 => ($data[13] / 1.5) - 1.0, // qty
  );
}
fclose($fh);
// Train a model on the samples in $output, passing a weight per class label
// (-1 => 0.65, 1 => 0.5), and store it in the file learn.model.
$svm = new svm();
$weights = array(-1 => 0.65, 1 => 0.5);
$model = $svm->train($output, $weights);
$model->save('learn.model');

// Run every training sample back through the model and tally the outcome.
// "Positive" means a label / prediction greater than zero.
$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach($output as $sample) {
  $predictedPositive = $model->predict($sample) > 0;
  $actualPositive = $sample[0] > 0;

  if($actualPositive && $predictedPositive) {
    $tp++;
  } elseif($actualPositive) {
    $fn++;
  } elseif($predictedPositive) {
    $fp++;
  } else {
    $tn++;
  }
}
// ...snip.. loading test data from
// paytest.csv

// Load the model stored in learn.model.
$model = new SVMModel('learn.model');

// Tally predictions against the known labels of the samples in $output.
// "Positive" means a label / prediction greater than zero.
$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach($output as $sample) {
  $predictedPositive = $model->predict($sample) > 0;
  $actualPositive = $sample[0] > 0;

  if($actualPositive && $predictedPositive) {
    $tp++;
  } elseif($actualPositive) {
    $fn++;
  } elseif($predictedPositive) {
    $fp++;
  } else {
    $tn++;
  }
}
                                   test.php
// Print the four confusion-matrix counters, then the share of samples that
// were classified correctly.
$counts = array(
  "True Positive" => $tp,
  "True Negative" => $tn,
  "False Positive" => $fp,
  "False Negative" => $fn,
);
foreach($counts as $label => $count) {
  var_dump($label . " " . $count);
}

$total = $tp + $tn + $fp + $fn;
var_dump("Accuracy " . (($tp + $tn) / $total));
$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"

$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"
training data


  Test         Verify       Update



Automated     Manual        Manual
Time Series           Class Based



   Sensitivity             Model



 False    Days To    False        False
Alarms    Detect    Positives   Negatives
(shogun)
TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

http://joind.in/3429

Ian Barber
ianb@php.net
Title Slide - CSI
http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department
http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl
http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card
http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck
http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster
http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View
http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes
http://www.flickr.com/photos/skrewtape/851672959/

Más contenido relacionado

La actualidad más candente

News of the Symfony2 World
News of the Symfony2 WorldNews of the Symfony2 World
News of the Symfony2 World
Fabien Potencier
 
Advanced modulinos
Advanced modulinosAdvanced modulinos
Advanced modulinos
brian d foy
 
20 modules i haven't yet talked about
20 modules i haven't yet talked about20 modules i haven't yet talked about
20 modules i haven't yet talked about
Tatsuhiko Miyagawa
 
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 TokyoIntroduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Masahiro Nagano
 

La actualidad más candente (18)

zinno
zinnozinno
zinno
 
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
 
Introdução ao Perl 6
Introdução ao Perl 6Introdução ao Perl 6
Introdução ao Perl 6
 
News of the Symfony2 World
News of the Symfony2 WorldNews of the Symfony2 World
News of the Symfony2 World
 
C99
C99C99
C99
 
Php 101: PDO
Php 101: PDOPhp 101: PDO
Php 101: PDO
 
Advanced modulinos
Advanced modulinosAdvanced modulinos
Advanced modulinos
 
Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8
 
The Magic Of Tie
The Magic Of TieThe Magic Of Tie
The Magic Of Tie
 
C99[2]
C99[2]C99[2]
C99[2]
 
Créer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heureCréer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heure
 
Advanced modulinos trial
Advanced modulinos trialAdvanced modulinos trial
Advanced modulinos trial
 
Cod
CodCod
Cod
 
20 modules i haven't yet talked about
20 modules i haven't yet talked about20 modules i haven't yet talked about
20 modules i haven't yet talked about
 
Melhorando sua API com DSLs
Melhorando sua API com DSLsMelhorando sua API com DSLs
Melhorando sua API com DSLs
 
Perl 6 by example
Perl 6 by examplePerl 6 by example
Perl 6 by example
 
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 TokyoIntroduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
 
PHP Language Trivia
PHP Language TriviaPHP Language Trivia
PHP Language Trivia
 

Destacado

Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500
Seth Greenberg
 
dollar general annual reports 2002
dollar general annual reports 2002dollar general annual reports 2002
dollar general annual reports 2002
finance41
 
Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009
mattdriscoll
 
Israel pide un rey
Israel pide un reyIsrael pide un rey
Israel pide un rey
Coke Neto
 
Technology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDFTechnology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDF
Justin Campbell
 

Destacado (16)

Deloittes 2009 Technology Fast 500™ Ranking
Deloittes 2009 Technology Fast 500™  RankingDeloittes 2009 Technology Fast 500™  Ranking
Deloittes 2009 Technology Fast 500™ Ranking
 
Canada Deber 2pdf
Canada Deber 2pdfCanada Deber 2pdf
Canada Deber 2pdf
 
Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500
 
dollar general annual reports 2002
dollar general annual reports 2002dollar general annual reports 2002
dollar general annual reports 2002
 
Deployment Tactics
Deployment TacticsDeployment Tactics
Deployment Tactics
 
20140528 valeant story draft deckv85
20140528 valeant story draft deckv8520140528 valeant story draft deckv85
20140528 valeant story draft deckv85
 
Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009
 
The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman
 
Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009
 
Document Classification In PHP - Slight Return
Document Classification In PHP - Slight ReturnDocument Classification In PHP - Slight Return
Document Classification In PHP - Slight Return
 
ZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 VersionZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 Version
 
Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?
 
Eca´s probabilidad y estadística Agosto 2012-Enero 2013
Eca´s probabilidad y estadística Agosto 2012-Enero 2013Eca´s probabilidad y estadística Agosto 2012-Enero 2013
Eca´s probabilidad y estadística Agosto 2012-Enero 2013
 
4 de febrero de 1992 pdf
4 de febrero de 1992 pdf4 de febrero de 1992 pdf
4 de febrero de 1992 pdf
 
Israel pide un rey
Israel pide un reyIsrael pide un rey
Israel pide un rey
 
Technology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDFTechnology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDF
 

Similar a Teaching Your Machine To Find Fraudsters

Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHP
Taras Kalapun
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnit
mfrost503
 
R57shell
R57shellR57shell
R57shell
ady36
 

Similar a Teaching Your Machine To Find Fraudsters (20)

Javascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introductionJavascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introduction
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHP
 
Coding website
Coding websiteCoding website
Coding website
 
Rails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and JasmineRails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
 
Your code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnConYour code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnCon
 
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnit
 
My Development Story
My Development StoryMy Development Story
My Development Story
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnit
 
Mocking Demystified
Mocking DemystifiedMocking Demystified
Mocking Demystified
 
Ns2programs
Ns2programsNs2programs
Ns2programs
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBenelux
 
Document Classification In PHP
Document Classification In PHPDocument Classification In PHP
Document Classification In PHP
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegame
 
WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015
 
Database api
Database apiDatabase api
Database api
 
ddd+scala
ddd+scaladdd+scala
ddd+scala
 
Game Development with SDL and Perl
Game Development with SDL and PerlGame Development with SDL and Perl
Game Development with SDL and Perl
 
R57shell
R57shellR57shell
R57shell
 

Último

IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
Enterprise Knowledge
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
Joaquim Jorge
 

Último (20)

2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
Slack Application Development 101 Slides
Slack Application Development 101 SlidesSlack Application Development 101 Slides
Slack Application Development 101 Slides
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 

Teaching Your Machine To Find Fraudsters

  • 1. TEACHING YOUR MACHINE TO FIND FRAUDSTERS Ian Barber ianb@php.net phpir.com twitter.com/ianbarber
  • 3. 5% 3% SOME .1% SMALL NUMBERS 8%
  • 5. REALLY REALLY LEGITIMATE FRAUD EVALUATED 989 0 LEGITIMATE EVALUATED 10 1 FRAUD
  • 6. REALLY REALLY LEGITIMATE FRAUD 90% EVALUATED LEGITIMATE WRONG 989 0 EVALUATED 10 1 FRAUD
  • 8. 30 22.5 Clicks 15 7.5 0 Date
  • 9. SOFTWARE ARCHITECTURE Alarm Detector No Alarm Buffer User Clicks Landing Ad Page
  • 10. DETECTOR statistics Expected Clicks Threshold Data Buffer Sensitivity Alarm
  • 11. average.php function detect($sen) { $window = array(); $i = 0; $alarmCount = 0; $dtd = 0; $avg = $stddev = 0; $fraud = fopen("fraudclicks.csv", 'r'); while($d = fgetcsv($fraud)) { $i++; if(count($window) > 7) { array_shift($window); $avg = array_sum($window) / 7; foreach($window as $val) { $stddev += pow($val - $average, 2); } $stddev = sqrt($stddev/7);
  • 12. 0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
  • 13. if($d[1] > ($avg + ($sen * $stddev))){ $alarmCount++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } array_push($window, $d[1]); } return array($alarmCount-1, $dtd); }
  • 14. 1.6 SENSITIVITY 30 18 False Alarms 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 15. 2.7 SENSITIVITY 30 1 False Alarm 18 Days To Detect 22.5 Clicks 15 7.5 0 Date
  • 17. function detect($sens) { sickavail.php $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) { $sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);
  • 18. $avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } // end if
  • 19. 0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10
  • 20. 0.011 SENSITIVITY 30 1 False Alarm 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 22. classification model SOFTWARE ARCHITECTURE Fraud Classifier Not Fraud User Transaction Purchase Processor Transaction Learner Database
  • 23. EVALUATING THE CLASSIFIER Training Data Learner Model Test Data Prediction Classifier Accuracy Model
  • 24. 20 15 10 5 0 0 5 10 15 20
  • 25. 20 15 10 5 ? 0 0 5 10 15 20
  • 26. 20 15 10 5 ? 0 0 5 10 15 20
  • 27. $docs = array( array('fraud' => false, 'price' => 1699, 'desc'=>'toy ninja', 'ship' => 'US'), array('fraud' => false, 'price' => 20000, 'desc' => 'TV','ship' => 'US'), array('fraud' => false, 'price' => 2500, 'desc' => 'cds', 'ship' => 'US'), array('fraud' => true, 'price' => 20000, 'desc' => 'console', 'ship' => 'CN'), array('fraud' => true, 'price' => 5000, 'desc' => 'books', 'ship' => 'US'), array('fraud' => true, 'price' => 15000, 'desc' => 'ipod', 'ship' => 'CN'), );
  • 28. $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); foreach($docs as $key => $doc) { $xdoc = new XapianDocument(); $xdoc->set_data($doc['fraud'] ? "fraud" : "clean"); $idx->set_document($xdoc); $idx->index_text($doc['price'] . ' ' . $doc['desc'] . ' ' . $doc['ship']); $db->add_document($xdoc, $key); } $db = null; frau dknn.php
  • 29. $test = array( testknn.ph p 'price' => 10000, 'desc' => 'TV', 'ship' => 'CN' ); $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); $xdoc = new XapianDocument(); $idx->set_document($xdoc); $idx->index_text($test['price'] . ' ' . $test['desc'] . ' ' . $test['ship']); $id = $db->add_document($xdoc);
  • 30. $enq = new XapianEnquire($db); $rset = new XapianRSet(); $rset->add_document($id); $eset = $enq->get_eset(10, $rset); $terms = array(); $i = $eset->begin(); while ( !$i->equals($eset->end()) ) { $terms[] = $i->get_term(); $i->next(); } $q = new XapianQuery( XapianQuery::OP_OR, $terms); $enq->set_query($q); $matches = $enq->get_mset(0, 4, $rset);
  • 31. $i = $matches->begin(); while (!$i->equals($matches->end())) { if($i->get_document()->get_docid() != $id) { $class = $i->get_document()->get_data(); var_dump($class); } $i->next(); } $db->delete_document($id); $ php testknn.php string(5) "clean" string(5) "fraud" string(5) "fraud"
  • 33.
  • 34.
  • 35. function compareEmailToName($name, $email) { $name = strtolower($name); $email = strtolower($email); $parts = explode(" ", $name); $pcnt = 0; list($user, $dom) = explode("@", $email); $user = str_replace( array(".", "+"), " ", $user); $dom = preg_replace("/..*/", "", $dom); similar_text($name, $user, $pcnt); if($pcnt > 80) { return 1.0; } similar_text($name, $dom, $pcnt); if($pcnt > 80) { return 0.8; } email.php
  • 36. if(count($parts)) { $highest = 0; foreach($parts as $part) { similar_text($user, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } similar_text($dom, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } } return (1.7 * ($highest/100)) - 1; } return -1; }
  • 37.
  • 38.
  • 39.
  • 40. $data = array( 'purchase_value' => 20993, 'geo_country' => 'DE', 'previous_orders' => 1, 'time' => 6, 'timegap' => 146632, 'product_category' => 'small_item', 'delivery_matches_card' => 0, 'geo_ip_matches_card' => 1, 'difference_from_last_trans' => 8755, 'free_shipping' => 0, 'email_like_name' => 0, 'free_email_provider' => 0, 'disposable_email_provider' => 0, 'quantity' => 2, 'fraud' => 0);
  • 42. 20 15 10 5 0 0 5 10 15 20
  • 43. 20 15 10 5 0 0 5 10 15 20
  • 44. 20 15 10 5 0 0 5 10 15 20
  • 45. 20 15 10 5 0 0 5 10 15 20
  • 46. 20 15 10 5 0 0 5 10 15 20
  • 47. $ apt-get install libsvm-dev $ apt-get install libsvm-tools $ yum install libsvm-devel $ pecl install svm-beta $ echo extension=svm.so > /etc/php.d/svm.ini $ php -r '$s = new svm(); $m = $s->train (array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));' -1
  • 48. $fh = fopen('paydata.csv', 'r'); $output = array(); while($data = fgetcsv($fh)) { $output[] = array( $data[14] == 1 ? -1 : 1, 1 => ($data[0]/20000.00) - 1.0, // price 2 => $data[1] == 'CN' ? 1.0:-1.0, 3 => $data[1] == 'US' ? 1.0:-1.0, 4 => $data[5] == 'digital' ? 1.0:-1.0, 5 => $data[7] == 1 ? 1.0:-1.0, //geo 6 => $data[6] == 1 ? 1.0:-1.0, // deliv 12 => $data[9] == 1 ? 1.0:-1.0, // ship 13 => ($data[13] / 1.5) - 1.0, // qty ); } learn.php
  • 49. $svm = new svm(); $model = $svm->train($output, array(-1 => 0.65, 1 => 0.5)); $model->save('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } }
  • 50. // ...snip.. loading test data from // paytest.csv $model = new SVMModel('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } } test.php
  • 51. var_dump("True Positive " . $tp); var_dump("True Negative " . $tn); var_dump("False Positive " . $fp); var_dump("False Negative " . $fn); var_dump("Accuracy " . (($tp+$tn)/($tp+$tn+$fp+$fn)));
  • 52. $ php learn.php string(18) "True Positive 8316" string(18) "True Negative 1682" string(16) "False Positive 2" string(16) "False Negative 0" string(15) "Accuracy 0.9998" $ php test.php string(17) "True Positive 844" string(17) "True Negative 155" string(16) "False Positive 0" string(16) "False Negative 1" string(14) "Accuracy 0.999"
  • 53. training data Test Verify Update Automated Manual Manual
  • 54. Time Series Class Based Sensitivity Model False Days To False False Alarms Detect Positives Negatives
  • 57. Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082 Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/ Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/ Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/ Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/ Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/ GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/ Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/