Más contenido relacionado La actualidad más candente (18) Similar a Teaching Your Machine To Find Fraudsters (20) Teaching Your Machine To Find Fraudsters
3. 5%
3%
SOME .1%
SMALL
NUMBERS 8%
5. REALLY REALLY
LEGITIMATE FRAUD
EVALUATED
989 0
LEGITIMATE
EVALUATED
10 1
FRAUD
6. REALLY REALLY
LEGITIMATE FRAUD
90%
EVALUATED
LEGITIMATE
WRONG
989 0
EVALUATED
10 1
FRAUD
8. 30
22.5
Clicks
15
7.5
0
Date
10. DETECTOR
statistics
Expected
Clicks
Threshold Data Buffer
Sensitivity
Alarm
11. average.php
/**
 * Moving-average click detector (average.php).
 *
 * Reads a CSV file row by row and compares column 1 of each row (the click
 * count) against the mean of the preceding rows plus $sen standard
 * deviations. A row above that threshold raises an alarm; the first alarm
 * raised after row $fraudStart stops the scan.
 *
 * Fixes over the original snippet:
 *  - the deviation was computed against an undefined $average instead of $avg;
 *  - $stddev was never reset, so each day's sum of squares started from the
 *    previous day's result;
 *  - the file handle was never checked or closed.
 *
 * @param float  $sen        Number of standard deviations above the window
 *                           mean a value must exceed to raise an alarm.
 * @param string $path       CSV file to read (default: the original
 *                           hard-coded "fraudclicks.csv").
 * @param int    $fraudStart Row number after which an alarm ends the scan and
 *                           non-alarming rows are counted (default 201, the
 *                           original hard-coded value).
 * @param int    $windowSize Number of preceding rows in the window (default 7).
 *
 * @return array array(alarm count minus one, number of non-alarming rows seen
 *               after $fraudStart). The slides label these "False Alarms" and
 *               "Days To Detect"; the "- 1" presumably discounts the final
 *               alarm that ended the scan, so the first element is -1 when no
 *               alarm was raised at all.
 *
 * @throws RuntimeException When the CSV file cannot be opened.
 */
function detect($sen, $path = "fraudclicks.csv", $fraudStart = 201, $windowSize = 7) {
    $windowSize = max(1, (int) $windowSize);

    $fraud = fopen($path, 'r');
    if ($fraud === false) {
        throw new RuntimeException("Unable to open click data file: " . $path);
    }

    $window = array();
    $i = 0;
    $alarmCount = 0;
    $dtd = 0;

    while (($d = fgetcsv($fraud)) !== false) {
        // fgetcsv() returns array(null) for a blank line; ignore rows that
        // have no click column instead of treating them as zero clicks.
        if (!isset($d[1])) {
            continue;
        }
        $clicks = (float) $d[1];
        $i++;

        // Same warm-up as the original: evaluation starts once the buffer
        // holds one more row than the window, and the oldest row is dropped
        // before the statistics are computed.
        if (count($window) > $windowSize) {
            array_shift($window);
            $n = count($window);
            $avg = array_sum($window) / $n;

            // Start the sum of squares from zero for every row.
            $sumSq = 0.0;
            foreach ($window as $val) {
                $sumSq += pow($val - $avg, 2);
            }
            $stddev = sqrt($sumSq / $n);

            // [transcript marker: 13.]
            if ($clicks > ($avg + ($sen * $stddev))) {
                $alarmCount++;
                if ($i > $fraudStart) {
                    break;
                }
            } elseif ($i > $fraudStart) {
                $dtd++;
            }
        }

        $window[] = $clicks;
    }

    fclose($fraud);

    return array($alarmCount - 1, $dtd);
}
14. 1.6 SENSITIVITY
30
18 False Alarms 1 Day To Detect
22.5
Clicks
15
7.5
0
Date
15. 2.7 SENSITIVITY
30
1 False Alarm 18 Days To Detect
22.5
Clicks
15
7.5
0
Date
17. function detect($sens) { sickavail.php
$i = 0; $alarms = 0; $dtd = 0;
$window = array(); $avail = array();
$fraud = fopen("fraudclicks.csv", 'r');
while($dat = fgetcsv($fraud)) {
$dow = date("w", strtotime($dat[0]));
if( count($window) >= 7
&& isset($avail[$dow]) ) {
$sick = 0;
foreach($window as $day => $value) {
$dowavg = array_sum($avail[$day]) /
count($avail[$day]);
$sick += $value / $dowavg;
}
$sick /= count($window);
18. $avlblty = array_sum($avail[$dow]) /
count($avail[$dow]);
$est = $sick * $avlblty;
$fac = fac($dat[1]);
$p = exp(-$est) * pow($est,$dat[1])
/ $fac; // poisson calc
if($p < $sens && $dat[1] > $est) {
$alarms++;
if($i > 201) { break; }
} else {
if($i > 201) { $dtd++; }
}
} // end if
// [transcript marker: 27.]
// Labelled sample orders: a fraud flag plus price, description and shipping
// country for each one.
$docs = array(
    array('fraud' => false, 'price' => 1699,
          'desc' => 'toy ninja', 'ship' => 'US'),
    array('fraud' => false, 'price' => 20000,
          'desc' => 'TV', 'ship' => 'US'),
    array('fraud' => false, 'price' => 2500,
          'desc' => 'cds', 'ship' => 'US'),
    array('fraud' => true, 'price' => 20000,
          'desc' => 'console', 'ship' => 'CN'),
    array('fraud' => true, 'price' => 5000,
          'desc' => 'books', 'ship' => 'US'),
    array('fraud' => true, 'price' => 15000,
          'desc' => 'ipod', 'ship' => 'CN'),
);
// [transcript marker: 28.]
// Index every sample order from $docs into the Xapian database "index".
// Each order's price, description and shipping country are indexed as one
// piece of text (with English stemming); the document data stores the label
// "fraud" or "clean".
$db = new XapianWritableDatabase("index",
    Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);
foreach ($docs as $key => $doc) {
    $xdoc = new XapianDocument();
    $xdoc->set_data($doc['fraud'] ?
        "fraud" : "clean");
    $idx->set_document($xdoc);
    $idx->index_text($doc['price'] . ' ' .
        $doc['desc'] . ' ' . $doc['ship']);
    // NOTE(review): add_document() is documented as taking only the
    // document; confirm whether the extra $key argument has any effect.
    $db->add_document($xdoc, $key);
}
// Drop the handle; presumably this is what flushes/closes the database — confirm.
$db = null;
fraudknn.php
// testknn.php
// [transcript marker: 29.]
// The order to classify, described by the same three fields that were
// indexed for the sample orders.
$test = array(
    'price' => 10000, 'desc' => 'TV',
    'ship' => 'CN'
);

// Temporarily add the test order to the index so it can be used as the
// relevance set for a query.
$db = new XapianWritableDatabase("index",
    Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text($test['price'] . ' ' .
    $test['desc'] . ' ' . $test['ship']);
$id = $db->add_document($xdoc);

// [transcript marker: 30.]
// Ask Xapian for up to 10 expansion terms based on the test document, then
// OR them together into a query and fetch the top 4 matches.
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);
$terms = array();
for ($i = $eset->begin(); !$i->equals($eset->end()); $i->next()) {
    $terms[] = $i->get_term();
}
$q = new XapianQuery(
    XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);

// [transcript marker: 31.]
// Dump the stored label ("fraud"/"clean") of every match except the test
// document itself.
for ($i = $matches->begin(); !$i->equals($matches->end()); $i->next()) {
    if ($i->get_document()->get_docid() != $id) {
        $class = $i->get_document()->get_data();
        var_dump($class);
    }
}

// Remove the temporary test document again.
$db->delete_document($id);
$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"
// [transcript marker: 35.] email.php
/**
 * Score how closely an email address resembles a person's name.
 *
 * Both inputs are lower-cased. The local part of the address has "." and "+"
 * replaced by spaces, and the domain is cut at its first dot. Similarity is
 * measured with similar_text() percentages:
 *
 *  - whole name vs. local part above 80%  => 1.0
 *  - whole name vs. domain above 80%      => 0.8
 *  - otherwise the best match above 50% between any space-separated part of
 *    the name and the local part or domain, mapped to
 *    1.7 * (best / 100) - 1; with no such match the result is -1.0.
 *
 * Fixes over the original snippet:
 *  - the best-match score was assigned from an undefined $percent instead of
 *    $pcnt, so the part-based score was always -1;
 *  - the domain pattern was "/..*" with an unescaped dot, which erased the
 *    whole domain instead of stripping everything from the first dot;
 *  - an address without "@" no longer triggers an undefined-offset error.
 *
 * @param string $name  Customer name, parts separated by spaces.
 * @param string $email Email address.
 *
 * @return float Score between -1.0 and 1.0.
 */
function compareEmailToName($name, $email) {
    $name = strtolower($name);
    $email = strtolower($email);
    $parts = explode(" ", $name);
    $pcnt = 0;

    // Same split as the original list() destructuring, but tolerant of a
    // missing "@" (the domain is then empty).
    $pieces = explode("@", $email);
    $user = $pieces[0];
    $dom = isset($pieces[1]) ? $pieces[1] : "";

    $user = str_replace(array(".", "+"), " ", $user);
    // Keep only the first label of the domain ("acme.com" => "acme").
    $dom = preg_replace('/\..*/', "", $dom);

    similar_text($name, $user, $pcnt);
    if ($pcnt > 80) {
        return 1.0;
    }

    similar_text($name, $dom, $pcnt);
    if ($pcnt > 80) {
        return 0.8;
    }

    // [transcript marker: 36.]
    // explode() always returns at least one element, so the original
    // count($parts) guard and its trailing "return -1" were redundant: with
    // no match above 50% the formula below already yields -1.
    $highest = 0;
    foreach ($parts as $part) {
        similar_text($user, $part, $pcnt);
        if ($pcnt > 50 && $pcnt > $highest) {
            $highest = $pcnt;
        }
        similar_text($dom, $part, $pcnt);
        if ($pcnt > 50 && $pcnt > $highest) {
            $highest = $pcnt;
        }
    }

    return (1.7 * ($highest / 100)) - 1;
}
// [transcript marker: 40.]
// Example transaction record with its 15 named fields; 'fraud' is the label.
// NOTE(review): the field order appears to line up with the numeric column
// indexes used when the CSV data is loaded (0 = purchase_value,
// 13 = quantity, 14 = fraud) — confirm against the data files.
$data = array(
    'purchase_value' => 20993,
    'geo_country' => 'DE',
    'previous_orders' => 1,
    'time' => 6,
    'timegap' => 146632,
    'product_category' => 'small_item',
    'delivery_matches_card' => 0,
    'geo_ip_matches_card' => 1,
    'difference_from_last_trans' => 8755,
    'free_shipping' => 0,
    'email_like_name' => 0,
    'free_email_provider' => 0,
    'disposable_email_provider' => 0,
    'quantity' => 2,
    'fraud' => 0);
47. $ apt-get install libsvm-dev
$ apt-get install libsvm-tools
$ yum install libsvm-devel
$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train
(array(array(-1, -1), array(1, 1))); echo
$m->predict(array(0, -1));'
-1
// [transcript marker: 48.] learn.php
// Load paydata.csv and turn every row into a training vector: element 0 is
// the class label (-1 when column 14 is 1, otherwise 1), followed by sparse
// features keyed by feature number and scaled to roughly the -1..1 range.
$fh = fopen('paydata.csv', 'r');
if ($fh === false) {
    throw new RuntimeException("Unable to open paydata.csv");
}
$output = array();
while (($data = fgetcsv($fh)) !== false) {
    // Skip blank or truncated rows rather than building a vector from
    // missing columns.
    if (!isset($data[14])) {
        continue;
    }
    $output[] = array(
        $data[14] == 1 ? -1 : 1,
        1 => ($data[0]/20000.00) - 1.0, // price
        2 => $data[1] == 'CN' ? 1.0:-1.0,
        3 => $data[1] == 'US' ? 1.0:-1.0,
        4 => $data[5] == 'digital' ? 1.0:-1.0,
        5 => $data[7] == 1 ? 1.0:-1.0, //geo
        6 => $data[6] == 1 ? 1.0:-1.0, // deliv
        12 => $data[9] == 1 ? 1.0:-1.0, // ship
        13 => ($data[13] / 1.5) - 1.0, // qty
    );
}
fclose($fh);
// [transcript marker: 49.]
// Train an SVM on $output with per-class weights (0.65 for label -1, 0.5 for
// label 1), save the model to learn.model, then score the model against the
// same vectors it was trained on.
$svm = new svm();
$model = $svm->train($output,
    array(-1 => 0.65, 1 => 0.5));
$model->save('learn.model');

// "Positive" here is label 1: $tp/$fn count vectors labelled 1, $fp/$tn
// count vectors labelled -1.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    // Element 0 is the class label, not a feature; predict() expects only
    // feature-number => value pairs, so strip the label before predicting.
    $features = $test;
    unset($features[0]);
    $res = $model->predict($features);
    if ($test[0] > 0) {
        if ($res > 0) { $tp++; }
        else { $fn++; }
    } else {
        if ($res > 0) { $fp++; }
        else { $tn++; }
    }
}
// [transcript marker: 50.]
// ...snip.. loading test data from
// paytest.csv
// Load the saved model and score it against the vectors in $output.
$model = new SVMModel('learn.model');

// "Positive" here is label 1: $tp/$fn count vectors labelled 1, $fp/$tn
// count vectors labelled -1.
$fp = $tp = $fn = $tn = 0;
foreach ($output as $test) {
    // Element 0 is the class label, not a feature; predict() expects only
    // feature-number => value pairs, so strip the label before predicting.
    $features = $test;
    unset($features[0]);
    $res = $model->predict($features);
    if ($test[0] > 0) {
        if ($res > 0) { $tp++; }
        else { $fn++; }
    } else {
        if ($res > 0) { $fp++; }
        else { $tn++; }
    }
}
test.php
// [transcript marker: 51.]
// Print the four outcome counters and the overall accuracy,
// (tp + tn) / (tp + tn + fp + fn).
var_dump("True Positive " . $tp);
var_dump("True Negative " . $tn);
var_dump("False Positive " . $fp);
var_dump("False Negative " . $fn);
// Avoid a division by zero when no vectors were evaluated.
$total = $tp + $tn + $fp + $fn;
var_dump("Accuracy " .
    ($total > 0 ? (($tp + $tn) / $total) : 0));
52. $ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"
$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"
54. Time Series Class Based
Sensitivity Model
False Days To False False
Alarms Detect Positives Negatives
57. Title Slide - CSI
http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department
http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl
http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card
http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck
http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster
http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View
http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes
http://www.flickr.com/photos/skrewtape/851672959/