SlideShare a Scribd company logo
1 of 39
An analysis and comparison from a developer’s perspective
   Report Buyer product catalogue:

    • Text fields: title, subtitle, summary, toc
    • Product code and ISBN
    • Supplier, category, type and availability
    • Publication date and price
Enterprise class search engine
Scalable and based on Apache Lucene
REST-ful API or PECL extension
Fast, transactional full-text indexing
Faceted and geospatial search
Rich document indexing
Comes with simple web interface
Built-in caching of queries and responses
Numerous plug-ins
   Available as system packages
   Uses Tomcat or Jetty
   Requires a restart on configuration change
   Packages install as a service
   Specify database location
   Memory settings
   Query caching options
   Request handler setup
   Search components and plug-ins
   Spell checker configuration
<!-- Report Buyer fields -->
<!-- item_guid identifies each document; it is declared as the uniqueKey further down. -->
<field name="item_guid" type="string" indexed="true" stored="true" required="true" />
<!-- Title and subtitle: indexed and stored text, with the title boosted above the subtitle (75 vs 25). -->
<field name="name" type="text" indexed="true" stored="true" required="true" boost="75"
     omitNorms="false" />
<field name="subtitle" type="text" indexed="true" stored="true" required="false" boost="25"
     omitNorms="false" />
<!-- Summary and table of contents are indexed for searching but not stored (stored="false"). -->
<field name="summary" type="text" indexed="true" stored="false" boost="1" omitNorms="false" />
<field name="toc" type="text" indexed="true" stored="false" boost="1" omitNorms="false" />
<!-- Identifier fields carry the highest boost (200); isbn is indexed only, product_code is also stored. -->
<field name="isbn" type="string" indexed="true" stored="false" boost="200" omitNorms="false" />
<field name="product_code" type="string" indexed="true" stored="true" boost="200" omitNorms="false" />
<!-- Typed fields: publication date, price and availability flag; all indexed and stored. -->
<field name="publish_date" type="tdate" indexed="true" stored="true" />
<field name="price" type="tfloat" indexed="true" stored="true" />
<field name="availability" type="boolean" indexed="true" stored="true" />
<!-- link is stored for display only; it is not indexed. -->
<field name="link" type="string" indexed="false" stored="true" />
<!-- Catch-all field: multi-valued, indexed but not stored; filled by the copyField rules below. -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>

<!-- Copy every free-text field into the catch-all "text" field. -->
<copyField source="name" dest="text"/>
<copyField source="subtitle" dest="text"/>
<copyField source="summary" dest="text"/>
<copyField source="toc" dest="text"/>

<!-- item_guid is the unique key; queries that name no field search "text". -->
<uniqueKey>item_guid</uniqueKey>
<defaultSearchField>text</defaultSearchField>
   Data Import Handler
   REST-ful API
   PHP PECL Extension
   Third-party libraries, like Solarium
<?php
// Import product rows into Solr using the PECL Solr extension.
//
// $result is not created in this snippet: it must already hold the MySQL
// result set whose columns match the Solr schema field names.
$solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080);
$solr         = new SolrClient($solr_options);

while ($row = mysql_fetch_array($result, MYSQL_ASSOC))
{
    // A fresh document per row (the original also built an unused one before the loop).
    $doc = new SolrInputDocument();

    // Reformat the publish date as YYYY-MM-DDT00:00:01Z.
    // date() replaces strftime(), which is deprecated as of PHP 8.1; the output is the same.
    $published = strtotime($row['publish_date']);
    if ($published === false) {
        // An unparsable date would otherwise be indexed as 1970-01-01; leave the field out instead.
        unset($row['publish_date']);
    } else {
        $row['publish_date'] = date('Y-m-d\T00:00:01\Z', $published);
    }

    foreach ($row as $key => $value) {
        // NULL columns would be sent as empty field values; skip them.
        if ($value === null) {
            continue;
        }
        $doc->addField($key, $value);
    }

    $updateResponse = $solr->addDocument($doc);
    $response = $updateResponse->getResponse();
    // A non-zero status in the response header is reported, then the import carries on.
    if ($response->responseHeader->status != 0) {
        print "Error importing into Solr: ";
        print_r($response);
    }
}

// Commit once, after all documents have been added.
$solr->commit();
?>
POST to http://localhost:8080/solr/update?commit=true

<add>
   <doc>
          <field name="item_guid">a34bbff9e17ada79658c72fde90c7369</field>
          <field name="name">Research Report on China's Corn Industry</field>
          <field name="price">1265</field>
          etc
    </doc>
</add>
<?php
// Run a faceted full-text query against Solr and print one line per match.
$solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080);
$solr = new SolrClient($solr_options);

// Kept in a variable because it is echoed back in the summary line below.
$query_string = "research in china";

$query = new SolrQuery();
$query->setQuery($query_string);
$query->setFacet(true);
$query->addFacetField('availability');

// Fields to return for each matching document.
$query->addField('item_guid')->addField('name')->addField('publish_date')->addField('subtitle')->
   addField('product_code')->addField('availability')->addField('price');

$query->addSortField('publish_date', SolrQuery::ORDER_DESC);

$query_response = $solr->query($query);
$response = $query_response->getResponse();

print "Found ".$response->response->numFound." results, for {$query_string} in ".
     $response->responseHeader->QTime." ms:\n\n";
foreach ($response->response->docs as $position=>$doc_data) {
     $download = ($doc_data['availability'] == '1') ? 'Yes' : 'No';
     // Printed exactly as Solr returns it; no reformatting is applied.
     $pub_date = $doc_data['publish_date'];
     print "{$position} - Date:{$pub_date} - {$doc_data['product_code']} - D/L:{$download} £".
          sprintf("%5d", $doc_data['price'])." - {$doc_data['name']}\n";
}
// Facet count for documents whose availability value is "false".
print "Facets for instant ".$response->facet_counts->facet_fields->availability->false;
?>
http://localhost:8080/solr/select/?q=research%20%in%20china&indent=on&hl=true&hl.fl=item_guid,name,
    publish_date,subtitle,product_code,availability,price&facet=true&facet.field=availability&wt=json

{
 "responseHeader":{
  "status":0, "QTime":20,
  "params":{
      "facet":"true",      "indent":"on",               "q":"research \u0000 china",
      "hl.fl":"item_guid,name,publish_date,subtitle,product_code,availability,price",
      "facet.field":"availability", "wt":"json", "hl":"true"}},
 "response":{"numFound":197481,"start":0,"docs":[
      {
       "item_guid":"e68cf64921a02e926137d78d2c52da35",
       "name":"Market Research Report on China Civil Aero Industry",
       "product_code":"SFC00076",
       "price":190.0, "availability":false,
       "type":10,
      "link":
      "/industry_manufacturing/plant_heavy_equipment/market_research_report_china_civil_aero_industry.
      html",
       "publish_date":"2008-07-22T00:00:01Z"
      }
}
   More features than other products
   Responsive, busy mailing list
   Large team of developers
   Good PHP libraries for integration
   Several books available
   Fairly heavy footprint
   Also built on Apache Lucene
   JSON-based
   Distributed, scalable server model
   Easy to configure, or configuration free
   Faceting and highlight support
   Auto type detection
   Multiple indexes
   CouchDB integration
   Download and unpack zip file
   Run elasticsearch/bin/elasticsearch
   No schema is required - almost
   No configuration is required - almost
GET http://localhost:9200/ HTTP/1.0
{
     "ok" : true,
     "name" : "Test",
     "version" : {
       "number" : "0.18.7",
       "snapshot_build" : false
     },
     "tagline" : "You Know, for Search",
     "cover" : "DON'T PANIC",
     "quote" : {
       "book" : "The Hitchhiker's Guide to the Galaxy",
       "chapter" : "Chapter 27",
       "text1" : "\"Forty-two,\" said Deep Thought, with infinite majesty and calm.",
       "text2" : "\"The Answer to the Great Question, of Life, the Universe and Everything\""
     }
   }
# Create the "reports" index with a custom analyzer (standard tokenizer, then the
# standard, lowercase and English stemmer filters).
# The key must be "index" - the original had the colon inside the quotes, which is invalid JSON.
curl -XPUT http://localhost:9200/reports/ -d '
{
     "index": {
           "analysis": {
                 "analyzer": {
                       "my_analyzer": {
                             "tokenizer": "standard",
                             "filter": ["standard", "lowercase", "my_stemmer"]
                       }
                 },
                 "filter": {
                       "my_stemmer": {
                             "type": "stemmer",
                             "name": "english"
                       }
                 }
           }
     }
}'
<?php
require_once 'ElasticSearch.php';

// Define the field mapping for the "report" type in the "reports" index.
$type = 'report';

// Per-field settings, built up one field at a time; insertion order is the order in the JSON.
$properties = array();
$properties['_id']          = array('type' => 'string', 'path' => 'item_guid');
$properties['item_guid']    = array('type' => 'string', 'store' => 'yes', 'index' => 'not_analyzed');
$properties['name']         = array('type' => 'string', 'store' => 'no', 'boost' => 75);
$properties['subtitle']     = array('type' => 'string', 'store' => 'yes', 'boost' => 25);
$properties['summary']      = array('type' => 'string', 'store' => 'yes', 'boost' => 10);
$properties['toc']          = array('type' => 'string', 'store' => 'no');
$properties['product_code'] = array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed');
$properties['isbn']         = array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed');

// The mapping document is keyed by type name, with the fields under "properties".
$mappings = array(
    $type => array(
        'properties' => $properties,
    ),
);
$json = json_encode($mappings);

// Send the JSON mapping through the ElasticSearch helper.
$es = new ElasticSearch();
$es->index = 'reports';
$es->map($type, $json);
?>
<?php
// Import every row of rb_search into the "reports" index as a "report" document.
require_once("ElasticSearch.php");
// read_query() is not defined in this snippet. The Sphinx import script in this file
// loads mysql.inc.php before calling it, so the same include is assumed here - TODO confirm.
require_once("mysql.inc.php");

$es = new ElasticSearch;
$es->index = 'reports';
$type = 'report';

// The source column is spelled `availibility`; it is aliased to the field name used in the index.
$sql = "SELECT `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`,
           `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`,
           `type`, `link`, `publish_date`
           FROM `rb_search`";

$result = read_query($sql);

while ($row = mysql_fetch_array($result, MYSQL_ASSOC))
{
    // json_encode() returns false when it cannot encode the row (e.g. malformed UTF-8);
    // report and skip the row rather than indexing an empty body.
    $json = json_encode($row);
    if ($json === false) {
        print "Error encoding {$row['item_guid']} as JSON, skipping\n";
        continue;
    }

    // item_guid is used as the document ID.
    $es->add($type, $row['item_guid'], $json);
}
?>
GET http://localhost:9200/reports/report/_count/

{"count":260349,"_shards":{"total":1,"successful":1,"failed":0}}
<?php
require_once 'ElasticSearch.php';

// Search the "report" type of the "reports" index for "research" in the name
// field, and request a terms facet on availability.
$es = new ElasticSearch();
$es->index = 'reports';
$type = 'report';

// Fields returned for each hit.
$fields = array('item_guid', 'name', 'subtitle');

// Term query against the name field.
$search = array(
    'term' => array('name' => 'research'),
);

// One terms facet, named after the field it counts.
$facets = array(
    'availability' => array(
        'terms' => array('field' => 'availability'),
    ),
);

// Key order matches the request body: fields, query, facets.
$query = array(
    'fields' => $fields,
    'query'  => $search,
    'facets' => $facets,
);

$result = $es->query($type, json_encode($query));
?>
   Nicolas Ruflin's Elastica
   Raymond Julin's elasticsearch
   Niranjan Uma Shankar's elasticsearch-php
   Very fast indexing
   Auto-scaling architecture
   Elegant REST approach
   Flexible zero configuration model
   Poor documentation
   No feature list, conceptual model or
    introduction
   All data is stored, meaning large indices
   Indexes MySQL, MSSQL, XML or ODBC
   Querying through Sphinx PHP API
   Searching through SQL queries or API
   Scalable to index 6TB of data in 16bn
    documents and 2000 queries/sec
   Used by Craigslist, Boardreader
   Runs as a storage engine in MySQL
   Install from system packages or source
   Source tarball is needed to get PHP
    SphinxAPI
   No other software needed
   Runs as a service in Ubuntu
   Plain index - fast search, slow update
   Real-time index - fast update, less efficient
   Distributed - combination of both methods
# Index definition for the Report Buyer product catalogue.
# NOTE(review): the import script and SphinxQL session shown elsewhere in this file use an
# index called `rb_search` with `title`, `availability` and `type` columns, none of which
# are declared here - confirm which definition is current.
index rb_test
{
     # index type: rt (real-time)
     type = rt
     # location of the index files on disk
     path = /mnt/data_indexed/sphinx/rb_test
     # define the fields we're indexing
     rt_field = name
     rt_field = subtitle
     rt_field = summary
     rt_field = toc

    # define the fields we want to get back out (attributes);
    # price is an unsigned integer, publish_date a timestamp, the rest are strings
    rt_attr_string = item_guid
    rt_attr_string = supplier
    rt_attr_string = product_code
    rt_attr_string = isbn
    rt_attr_string = category
    rt_attr_uint = price
    rt_attr_string = link
    rt_attr_timestamp = publish_date

    # morphology preprocessors to apply: English stemming
    morphology                          = stem_en
    # HTML stripping is switched on; the listed img/a attributes are still indexed,
    # and style/script elements are removed
    html_strip                          =1
    html_index_attrs    = img=alt,title; a=title;
    html_remove_elements                = style, script
}
<?php
// Copy every row of the MySQL rb_search table into the Sphinx real-time index
// over SphinxQL (searchd speaking the MySQL protocol on port 9306).
require_once("mysql.inc.php");

// The 64-bit numeric `id` is taken from the first 16 hex digits of md5(item_guid);
// publish_date is converted to a Unix timestamp.
$sql = "SELECT conv(mid(md5(`item_guid`), 1, 16), 16, 10) AS `id`, `item_guid`, `name`,
            `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`,
            `price`, `availibility` as `availability`, `type`, `link`, UNIX_TIMESTAMP(`publish_date`) AS
     `publish_date` FROM `rb_search`";
$result = read_query($sql);

// Fourth argument true forces a new link, so this connection is separate from the
// one read_query() uses.
$sphinx = mysql_connect("127.0.0.1:9306", "", "", true);
if ($sphinx === false) {
     die("Could not connect to Sphinx: " . mysql_error() . "\n");
}

while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) {
     foreach ($row as $key=>$value) {
            // mysql_escape_string() is deprecated; the _real_ variant escapes for this connection.
            $row[$key] = mysql_real_escape_string($value, $sphinx);
     }
     // The source `name` column is written to the index's `title` column.
     $sql = "REPLACE INTO `rb_search` (`id`, `title`, `subtitle`,`availability`, `type`, `price`, `publish_date`,
     `item_guid`, `supplier`, `product_code`, `isbn`, `category`, `link`, `summary`, `toc`)
VALUES
            ('{$row['id']}', '{$row['name']}', '{$row['subtitle']}', '{$row['availability']}',
     '{$row['type']}','{$row['price']}', '{$row['publish_date']}', '{$row['item_guid']}', '{$row['supplier']}',
     '{$row['product_code']}', '{$row['isbn']}', '{$row['category']}', '{$row['link']}','{$row['summary']}',
     '{$row['toc']}')";
     // Report rejected rows instead of dropping them silently, then carry on.
     if (mysql_query($sql, $sphinx) === false) {
            print "Error importing {$row['item_guid']} into Sphinx: " . mysql_error($sphinx) . "\n";
     }
}
?>
mysql --host=127.0.0.1 --port=9306

Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 1
Server version: 2.0.3-id64-release (r3043)

mysql> select item_guid, title, subtitle, price from rb_search where match('china pharmaceutical') and price
     > 100 and price < 300 limit 2\G
************************** 1. row ***************************
    id: 5228810066049016302
  weight: 6671
  price: 220
item_guid: cc74cb075aa37696198e87850f033398
  title: North China Pharmaceutical Group Corp-Therapeutic Competitors Report
 subtitle:
*************************** 2. row ***************************
    id: 3548867347418583847
  weight: 6662
  price: 190
item_guid: 6ce04df0fb277aa3ff596c2ca00c81a9
  title: China Pharmaceutical Industry Report
 subtitle: 2006-2007
2 rows in set (0.01 sec)
   Fastest indexing of all engines
   Really simple interface via SQL
   Document IDs must be unsigned integers
   No faceting support
   Good support in forums
   Deployed as a C++ library
   Bindings provided to connect to PHP
   Available in most package repositories
   Bindings need to be compiled separately
   Query Parser, similar to other engines
   Stemming and faceted search
   Server replication
   Install from system packages
   Compile PHP bindings from source
   No other software needed
   Runs on demand
   No configuration required
   Define-and-go schema
   Documents
   Terms
   Values
   Document data
<?php
// Build the Xapian index from the product rows.
//
// Not defined in this snippet and expected to be set up beforehand:
//   $xapian               - database path handed to XapianWritableDatabase
//   $result               - MySQL result set of product rows
//   $xapian_term_weights  - map of row field name => weight for free-text indexing
//   $xapian_value_slots   - map of field name => value slot number
$xapian_db = new XapianWritableDatabase($xapian, Xapian::DB_CREATE_OR_OVERWRITE);
$xapian_term_generator = new XapianTermGenerator();
$xapian_term_generator->set_stemmer(new XapianStem("english"));

while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) {
    $doc = new XapianDocument();
    $xapian_term_generator->set_document($doc);

    // Free-text fields, each with its configured weight.
    foreach ($xapian_term_weights as $field => $weight) {
        $xapian_term_generator->index_text($row[$field], $weight);
    }
    // The name is indexed a second time under the "S:" prefix with weight 75.
    $xapian_term_generator->index_text($row['name'], 75, 'S:');
    $doc->add_boolean_term('CODE:' . $row['product_code']);

    // Values: price is serialised with sortable_serialise, the date is stored as YYYYMMDD.
    $doc->add_value($xapian_value_slots['price'], Xapian::sortable_serialise($row['price']));
    // date() replaces strftime(), which is deprecated as of PHP 8.1; the output is the same.
    $doc->add_value($xapian_value_slots['publish_date'], date('Ymd', strtotime($row['publish_date'])));

    // add in additional values that we're going to use for facets
    $doc->add_value($xapian_value_slots['availability'], $row['availability']);

    // NOTE(review): $doc_data is never assigned in this snippet, so as written a
    // serialised null is stored. Presumably it should hold this row's display fields - confirm.
    $doc->set_data(serialize($doc_data));

    // replace_document() finds the document to replace by this term, so the term
    // must also be on the document or a later re-index would add duplicates.
    $docid = 'Q'.$row['item_guid'];
    $doc->add_boolean_term($docid);
    $xapian_db->replace_document($docid, $doc);
}
?>
<?php
// Parse a user-style query, run it against the Xapian database and walk the matches.
//
// Not defined in this snippet and expected to be set up beforehand:
// $xapian (database path), $xapian_value_slots (field name => value slot),
// $offset and $pagesize (paging window for get_mset).
$xapian_db = new XapianDatabase($xapian);
$query_parser = new XapianQueryParser();
$query_parser->set_stemmer(new XapianStem("english"));
// set_stemmer() alone does not enable stemming on Xapian releases whose default
// strategy is STEM_NONE; request it explicitly.
$query_parser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
$query_parser->set_default_op(XapianQuery::OP_AND);

// Range processors, kept in variables so they stay alive while the parser uses them.
$dvrProcessor = new XapianDateValueRangeProcessor($xapian_value_slots['publish_date'], 'date:');
$query_parser->add_valuerangeprocessor($dvrProcessor);
// Without this, "price:10..150" is not treated as a range over the price value slot.
// The number processor matches the sortable_serialise() encoding used when indexing.
$nvrProcessor = new XapianNumberValueRangeProcessor($xapian_value_slots['price'], 'price:');
$query_parser->add_valuerangeprocessor($nvrProcessor);

// Map the user-facing prefixes onto the term prefixes.
$query_parser->add_prefix("code", "CODE:");
$query_parser->add_prefix("category", "CATEGORY:");
$query_parser->add_prefix("title", "S:");
// Plain ASCII double quotes mark the phrase (the original had typographic quotes).
$query = $query_parser->parse_query('"Medical devices" NEAR china NOT russian price:10..150 category:medical');

$enquire = new XapianEnquire($xapian_db);
$enquire->set_query($query);
$matches = $enquire->get_mset($offset, $pagesize);

// The iterators must come from the match set; the original never initialised them.
$start = $matches->begin();
$end   = $matches->end();
while (!($start->equals($end))) {
     $doc = $start->get_document();
     // Decode the price back into a number.
     $price = Xapian::sortable_unserialise($doc->get_value($xapian_value_slots['price']));
     $start->next();
}?>
   Only one option available from Xapian
   Requires additional compilation due to
    licensing
   Not very well documented API
   Reasonably fast indexing
   Very flexible implementation
   Faceting and range searching
   Good Quick Start guide
   Responsive mailing list
   Third-party paid support
   Every project has different needs
   Not one search product fits all
   Fastest to index was Sphinx
   Most feature-rich: Solr
   The next steps are up to you

More Related Content

What's hot

Jquery presentation
Jquery presentationJquery presentation
Jquery presentationguest5d87aa6
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegamehozayfa999
 
Your code sucks, let's fix it
Your code sucks, let's fix itYour code sucks, let's fix it
Your code sucks, let's fix itRafael Dohms
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsGuilherme Blanco
 
Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012Rafael Dohms
 
Drupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary EditionDrupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary Editionddiers
 
Lithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo EditionLithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo EditionNate Abele
 
Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)Rafael Dohms
 
Doctrine 2
Doctrine 2Doctrine 2
Doctrine 2zfconfua
 
PHP tips and tricks
PHP tips and tricks PHP tips and tricks
PHP tips and tricks Damien Seguy
 
Php code for online quiz
Php code for online quizPhp code for online quiz
Php code for online quizhnyb1002
 
Drupal II: The SQL
Drupal II: The SQLDrupal II: The SQL
Drupal II: The SQLddiers
 
PHP Data Objects
PHP Data ObjectsPHP Data Objects
PHP Data ObjectsWez Furlong
 
Erlang for data ops
Erlang for data opsErlang for data ops
Erlang for data opsmnacos
 
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014NoSQLmatters
 

What's hot (18)

Jquery presentation
Jquery presentationJquery presentation
Jquery presentation
 
Php 101: PDO
Php 101: PDOPhp 101: PDO
Php 101: PDO
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegame
 
Your code sucks, let's fix it
Your code sucks, let's fix itYour code sucks, let's fix it
Your code sucks, let's fix it
 
PHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object CalisthenicsPHP for Adults: Clean Code and Object Calisthenics
PHP for Adults: Clean Code and Object Calisthenics
 
Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012Your code sucks, let's fix it - PHP Master Series 2012
Your code sucks, let's fix it - PHP Master Series 2012
 
Drupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary EditionDrupal - dbtng 25th Anniversary Edition
Drupal - dbtng 25th Anniversary Edition
 
Lithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo EditionLithium: The Framework for People Who Hate Frameworks, Tokyo Edition
Lithium: The Framework for People Who Hate Frameworks, Tokyo Edition
 
Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)Your code sucks, let's fix it (CakeFest2012)
Your code sucks, let's fix it (CakeFest2012)
 
Doctrine 2
Doctrine 2Doctrine 2
Doctrine 2
 
PHP tips and tricks
PHP tips and tricks PHP tips and tricks
PHP tips and tricks
 
Php code for online quiz
Php code for online quizPhp code for online quiz
Php code for online quiz
 
Drupal7 dbtng
Drupal7  dbtngDrupal7  dbtng
Drupal7 dbtng
 
Drupal II: The SQL
Drupal II: The SQLDrupal II: The SQL
Drupal II: The SQL
 
PHP Data Objects
PHP Data ObjectsPHP Data Objects
PHP Data Objects
 
Erlang for data ops
Erlang for data opsErlang for data ops
Erlang for data ops
 
Drupal 8 database api
Drupal 8 database apiDrupal 8 database api
Drupal 8 database api
 
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
 

Viewers also liked

Search search search
Search search searchSearch search search
Search search searchAndy Dai
 
Poitou charentes JUG - Elasticsearch
Poitou charentes JUG - ElasticsearchPoitou charentes JUG - Elasticsearch
Poitou charentes JUG - ElasticsearchDavid Pilato
 
The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...Sylvain Zimmer
 
Elasticsearch
ElasticsearchElasticsearch
Elasticsearchnewegg
 
Comparing open source search engines
Comparing open source search enginesComparing open source search engines
Comparing open source search enginesRichard Boulton
 
Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5Burak TUNGUT
 
Oxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overviewOxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overviewLudovic Piot
 
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassinOxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassinOxalide
 

Viewers also liked (11)

Search search search
Search search searchSearch search search
Search search search
 
Poitou charentes JUG - Elasticsearch
Poitou charentes JUG - ElasticsearchPoitou charentes JUG - Elasticsearch
Poitou charentes JUG - Elasticsearch
 
Introducing ElasticSearch - Ashish
Introducing ElasticSearch - AshishIntroducing ElasticSearch - Ashish
Introducing ElasticSearch - Ashish
 
The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...The original vision of Nutch, 14 years later: Building an open source search ...
The original vision of Nutch, 14 years later: Building an open source search ...
 
Elasticsearch
ElasticsearchElasticsearch
Elasticsearch
 
Comparing open source search engines
Comparing open source search enginesComparing open source search engines
Comparing open source search engines
 
Elastic search
Elastic searchElastic search
Elastic search
 
Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5Elasticsearch Arcihtecture & What's New in Version 5
Elasticsearch Arcihtecture & What's New in Version 5
 
Oxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overviewOxalide Workshop #3 - Elasticearch, an overview
Oxalide Workshop #3 - Elasticearch, an overview
 
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassinOxalide Workshop #4 - Docker, des tours dans le petit bassin
Oxalide Workshop #4 - Docker, des tours dans le petit bassin
 
(Elastic)search in big data
(Elastic)search in big data(Elastic)search in big data
(Elastic)search in big data
 

Similar to Open Source Search: An Analysis

Propel sfugmd
Propel sfugmdPropel sfugmd
Propel sfugmdiKlaus
 
The Zen of Lithium
The Zen of LithiumThe Zen of Lithium
The Zen of LithiumNate Abele
 
第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 DatasourceKaz Watanabe
 
The State of Lithium
The State of LithiumThe State of Lithium
The State of LithiumNate Abele
 
From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)Night Sailer
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11Michelangelo van Dam
 
PostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL SuperpowersPostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL SuperpowersAmanda Gilmore
 
Hidden treasures of Ruby
Hidden treasures of RubyHidden treasures of Ruby
Hidden treasures of RubyTom Crinson
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxMichelangelo van Dam
 
Bag Of Tricks From Iusethis
Bag Of Tricks From IusethisBag Of Tricks From Iusethis
Bag Of Tricks From IusethisMarcus Ramberg
 
WordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPressWordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPressAlena Holligan
 
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHPPHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHPiMasters
 
Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Wongnai
 
Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010Michelangelo van Dam
 
laravel tricks in 50minutes
laravel tricks in 50minuteslaravel tricks in 50minutes
laravel tricks in 50minutesBarang CK
 
50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 Minutes50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 MinutesAzim Kurt
 
Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018Balázs Tatár
 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodJeremy Kendall
 

Similar to Open Source Search: An Analysis (20)

Propel sfugmd
Propel sfugmdPropel sfugmd
Propel sfugmd
 
The Zen of Lithium
The Zen of LithiumThe Zen of Lithium
The Zen of Lithium
 
第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource第49回Php勉強会@関東 Datasource
第49回Php勉強会@関東 Datasource
 
The State of Lithium
The State of LithiumThe State of Lithium
The State of Lithium
 
Broadleaf Presents Thymeleaf
Broadleaf Presents ThymeleafBroadleaf Presents Thymeleaf
Broadleaf Presents Thymeleaf
 
From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)From mysql to MongoDB(MongoDB2011北京交流会)
From mysql to MongoDB(MongoDB2011北京交流会)
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11
 
PostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL SuperpowersPostgreSQL's Secret NoSQL Superpowers
PostgreSQL's Secret NoSQL Superpowers
 
Hidden treasures of Ruby
Hidden treasures of RubyHidden treasures of Ruby
Hidden treasures of Ruby
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBenelux
 
Bag Of Tricks From Iusethis
Bag Of Tricks From IusethisBag Of Tricks From Iusethis
Bag Of Tricks From Iusethis
 
WordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPressWordCamp Portland 2018: PHP for WordPress
WordCamp Portland 2018: PHP for WordPress
 
Database api
Database apiDatabase api
Database api
 
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHPPHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
PHP Experience 2016 - [Workshop] Elastic Search: Turbinando sua aplicação PHP
 
Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)Solr's Search Relevancy (Understand Solr's query debug)
Solr's Search Relevancy (Understand Solr's query debug)
 
Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010
 
laravel tricks in 50minutes
laravel tricks in 50minuteslaravel tricks in 50minutes
laravel tricks in 50minutes
 
50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 Minutes50 Laravel Tricks in 50 Minutes
50 Laravel Tricks in 50 Minutes
 
Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018Let's write secure Drupal code! - DrupalCamp Oslo, 2018
Let's write secure Drupal code! - DrupalCamp Oslo, 2018
 
Php 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the GoodPhp 102: Out with the Bad, In with the Good
Php 102: Out with the Bad, In with the Good
 

Recently uploaded

From Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time AutomationFrom Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time AutomationSafe Software
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slidevu2urc
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsRoshan Dwivedi
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Miguel Araújo
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024The Digital Insurer
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Allon Mureinik
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdfhans926745
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)Gabriella Davis
 
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptxEIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptxEarley Information Science
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Scriptwesley chun
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationMichael W. Hawkins
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...apidays
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking MenDelhi Call girls
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfEnterprise Knowledge
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024The Digital Insurer
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptxHampshireHUG
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processorsdebabhi2
 

Recently uploaded (20)

From Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time AutomationFrom Event to Action: Accelerate Your Decision Making with Real-Time Automation
From Event to Action: Accelerate Your Decision Making with Real-Time Automation
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf[2024]Digital Global Overview Report 2024 Meltwater.pdf
[2024]Digital Global Overview Report 2024 Meltwater.pdf
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptxEIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
EIS-Webinar-Prompt-Knowledge-Eng-2024-04-08.pptx
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024
 
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
04-2024-HHUG-Sales-and-Marketing-Alignment.pptx
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processors
 

Open Source Search: An Analysis

  • 1. An analysis and comparison from a developer’s perspective
  • 2.
  • 3. Report Buyer product catalogue: • Text fields: title, subtitle, summary, toc • Product code and ISBN • Supplier, category, type and availability • Publication date and price
  • 4. Enterprise class search engine Scalable and based on Apache Lucene REST-ful API or PECL extension Fast, transactional full-text indexing Faceted and geospatial search Rich document indexing Comes with simple web interface Built-in caching of queries and responses Numerous plug-ins
  • 5. Available as system packages • Uses Tomcat or Jetty • Requires a restart on configuration change • Packages install as a service
  • 6. Specify database location • Memory settings • Query caching options • Request handler setup • Search components and plug-ins • Spell checker configuration
  • 7. <!-- Report Buyer fields --> <field name="item_guid" type="string" indexed="true" stored="true" required="true" /> <field name="name" type="text" indexed="true" stored="true" required="true" boost="75" omitNorms="false" /> <field name="subtitle" type="text" indexed="true" stored="true" required="false" boost="25" omitNorms="false" /> <field name="summary" type="text" indexed="true" stored="false" boost="1" omitNorms="false" /> <field name="toc" type="text" indexed="true" stored="false" boost="1" omitNorms="false" /> <field name="isbn" type="string" indexed="true" stored="false" boost="200" omitNorms="false" /> <field name="product_code" type="string" indexed="true" stored="true" boost="200" omitNorms="false" /> <field name="publish_date" type="tdate" indexed="true" stored="true" /> <field name="price" type="tfloat" indexed="true" stored="true" /> <field name="availability" type="boolean" indexed="true" stored="true" /> <field name="link" type="string" indexed="false" stored="true" /> <field name="text" type="text" indexed="true" stored="false" multiValued="true"/> <copyField source="name" dest="text"/> <copyField source="subtitle" dest="text"/> <copyField source="summary" dest="text"/> <copyField source="toc" dest="text"/> <uniqueKey>item_guid</uniqueKey> <defaultSearchField>text</defaultSearchField>
  • 8. Data Import Handler • REST-ful API • PHP PECL Extension • Third-party libraries, like Solarium
  • 9. <?php $solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080); $solr = new SolrClient($solr_options); $doc = new SolrInputDocument(); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $doc = new SolrInputDocument(); $row['publish_date'] = strftime('%Y-%m-%dT00:00:01Z', strtotime($row['publish_date'])); foreach ($row as $key => $value) { $doc->addField($key, $value); } $updateResponse = $solr->addDocument($doc); $response = $updateResponse->getResponse(); if ($response->responseHeader->status != 0) { print "Error importing into Solr: "; print_r($response); } } $solr->commit(); ?>
  • 10. POST to http://localhost:8080/solr/update?commit=true <add> <doc> <field name="item_guid">a34bbff9e17ada79658c72fde90c7369</field> <field name="name">Research Report on China's Corn Industry</field> <field name="price">1265</field> etc </doc> </add>
  • 11. $solr_options = array('secure' => false, 'hostname' => 'localhost', 'port' => 8080); $solr = new SolrClient($solr_options); $query = new SolrQuery(); $query->setQuery("research in china"); $query->setFacet(true); $query->addFacetField('availability'); $query->addField('item_guid')->addField('name')->addField('publish_date')->addField('subtitle')-> addField('product_code')->addField('availability')->addField('price'); $query->addSortField('publish_date', SolrQuery::ORDER_DESC); $query_response = $solr->query($query); $response = $query_response->getResponse(); print "Found ".$response->response->numFound." results, for {$query_string} in ".$response- >responseHeader->QTime." ms:nn"; foreach ($response->response->docs as $position=>$doc_data) { $download = ($doc_data['availability'] == '1') ? 'Yes' : 'No'; print "{$position} - Date:{$pub_date} - {$doc_data['product_code']} - D/L:{$download} £".sprintf("%5d", $doc_data['price'])." - {$doc_data['name']}n"; } print "Facets for instant ".$response->facet_counts->facet_fields->availability->false;
  • 12. http://localhost:8080/solr/select/?q=research%20%in%20china&indent=on&hl=true&hl.fl=item_guid,name, publish_date,subtitle,product_code,availability,price&facet=true&facet.field=availability&wt=json { "responseHeader":{ "status":0, "QTime":20, "params":{ "facet":"true", "indent":"on", "q":"research u0000 china", "hl.fl":"item_guid,name,publish_date,subtitle,product_code,availability,price", "facet.field":"availability", "wt":"json", "hl":"true"}}, "response":{"numFound":197481,"start":0,"docs":[ { "item_guid":"e68cf64921a02e926137d78d2c52da35", "name":"Market Research Report on China Civil Aero Industry", "product_code":"SFC00076", "price":190.0, "availability":false, "type":10, "link": "/industry_manufacturing/plant_heavy_equipment/market_research_report_china_civil_aero_industry. html", "publish_date":"2008-07-22T00:00:01Z" } }
  • 13. More features than other products • Responsive, busy mailing list • Large team of developers • Good PHP libraries for integration • Several books available • Fairly heavy footprint
  • 14. Also built on Apache Lucene • JSON-based • Distributed, scalable server model • Easy to configure, or configuration free • Faceting and highlight support • Auto type detection • Multiple indexes • CouchDB integration
  • 15. Download and unpack zip file • Run elasticsearch/bin/elasticsearch
  • 16. No schema is required - almost • No configuration is required - almost
  • 17. GET http://localhost:9200/ HTTP/1.0 { "ok" : true, "name" : "Test", "version" : { "number" : "0.18.7", "snapshot_build" : false }, "tagline" : "You Know, for Search", "cover" : "DON'T PANIC", "quote" : { "book" : "The Hitchhiker's Guide to the Galaxy", "chapter" : "Chapter 27", "text1" : ""Forty-two," said Deep Thought, with infinite majesty and calm.", "text2" : ""The Answer to the Great Question, of Life, the Universe and Everything"" } }
  • 18. curl -XPUT http://localhost:9200/reports/ -d ' { "index:" { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "standard", "filter": ["standard", "lowercase", "my_stemmer"] } }, "filter": { "my_stemmer": { "type": "stemmer", "name": "english" } } } } }'
  • 19. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $mappings = array($type => array('properties' => array( '_id' => array('type' => 'string', 'path' => 'item_guid'), 'item_guid' => array('type' => 'string', 'store' => 'yes', 'index' => 'not_analyzed'), 'name' => array('type' => 'string', 'store' => 'no', 'boost' => 75), 'subtitle' => array('type' => 'string', 'store' => 'yes', 'boost' => 25), 'summary' => array('type' => 'string', 'store' => 'yes', 'boost' => 10), 'toc' => array('type' => 'string', 'store' => 'no'), 'product_code' => array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed'), 'isbn' => array('type' => 'string', 'store' => 'yes', 'boost' => 200, 'index' => 'not_analyzed'), ))); $json = json_encode($mappings); $es->map($type, $json); ?>
  • 20. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $sql = "SELECT `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`, `type`, `link`, `publish_date` FROM `rb_search`"; $result = read_query($sql); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $es->add($type, $row['item_guid'], json_encode($row)); } ?>
  • 22. <?php require_once("ElasticSearch.php"); $es = new ElasticSearch; $es->index = 'reports'; $type = 'report'; $query = array( 'fields' => array('item_guid', 'name', 'subtitle'), 'query' => array( 'term' => array('name' => 'research'), ), 'facets' => array( 'availability' => array( 'terms' => array('field' => 'availability') ) ) ); $result = $es->query($type, json_encode($query)); ?>
  • 23. Nicolas Ruflin's elastica • Raymond Julin's elasticsearch • Niranjan Uma Shankar's elasticsearch-php
  • 24. Very fast indexing • Auto-scaling architecture • Elegant REST approach • Flexible zero configuration model • Poor documentation • No feature list, conceptual model or introduction • All data is stored, meaning large indices
  • 25. Indexes MySQL, MSSQL, XML or ODBC • Querying through Sphinx PHP API • Searching through SQL queries or API • Scalable to index 6TB of data in 16bn documents and 2000 queries/sec • Used by Craigslist, Boardreader • Runs as a storage engine in MySQL
  • 26. Install from system packages or source • Source tarball is needed to get PHP SphinxAPI • No other software needed • Runs as a service in Ubuntu
  • 27. Plain index - fast search, slow update • Real-time index - fast update, less efficient • Distributed - combination of both methods
  • 28. index rb_test { # index type type = rt path = /mnt/data_indexed/sphinx/rb_test # define the fields we're indexing rt_field = name rt_field = subtitle rt_field = summary rt_field = toc #define the fields we want to get back out rt_attr_string = item_guid rt_attr_string = supplier rt_attr_string = product_code rt_attr_string = isbn rt_attr_string = category rt_attr_uint = price rt_attr_string = link rt_attr_timestamp = publish_date # morphology preprocessors to apply morphology = stem_en html_strip =1 html_index_attrs = img=alt,title; a=title; html_remove_elements = style, script }
  • 29. <?php require_once("mysql.inc.php"); $sql = "SELECT conv(mid(md5(`item_guid`), 1, 16), 16, 10) AS `id`, `item_guid`, `name`, `subtitle`, `summary`, `toc`, `supplier`, `product_code`, `isbn`, `category`, `price`, `availibility` as `availability`, `type`, `link`, UNIX_TIMESTAMP(`publish_date`) AS `publish_date` FROM `rb_search`"; $result = read_query($sql); $sphinx = mysql_connect("127.0.0.1:9306", "", "", true); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { foreach ($row as $key=>$value) { $row[$key] = mysql_escape_string($value); } $sql = "REPLACE INTO `rb_search` (`id`, `title`, `subtitle`,`availability`, `type`, `price`, `publish_date`, `item_guid`, `supplier`, `product_code`, `isbn`, `category`, `link`, `summary`, `toc`) VALUES ('{$row['id']}', '{$row['name']}', '{$row['subtitle']}', '{$row['availability']}', '{$row['type']}','{$row['price']}', '{$row['publish_date']}', '{$row['item_guid']}', '{$row['supplier']}', '{$row['product_code']}', '{$row['isbn']}', '{$row['category']}', '{$row['link']}','{$row['summary']}', '{$row['toc']}')"; mysql_query($sql, $sphinx); } ?>
  • 30. mysql --host=127.0.0.1 --port=9306 Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 1 Server version: 2.0.3-id64-release (r3043) mysql> select item_guid, title, subtitle, price from rb_search where match('china pharmaceutical') and price > 100 and price < 300 limit 2\G ************************** 1. row *************************** id: 5228810066049016302 weight: 6671 price: 220 item_guid: cc74cb075aa37696198e87850f033398 title: North China Pharmaceutical Group Corp-Therapeutic Competitors Report subtitle: *************************** 2. row *************************** id: 3548867347418583847 weight: 6662 price: 190 item_guid: 6ce04df0fb277aa3ff596c2ca00c81a9 title: China Pharmaceutical Industry Report subtitle: 2006-2007 2 rows in set (0.01 sec)
  • 31. Fastest indexing of all engines • Really simple interface via SQL • Document IDs must be unsigned integers • No faceting support • Good support in forums
  • 32. Deployed as a C++ library • Bindings provided to connect to PHP • Available in most package repositories • Bindings need to be compiled separately • Query Parser, similar to other engines • Stemming and faceted search • Server replication
  • 33. Install from system packages • Compile PHP bindings from source • No other software needed • Runs on demand
  • 34. No configuration required • Define-and-go schema • Documents • Terms • Values • Document data
  • 35. <?php $xapian_db = new XapianWritableDatabase($xapian, Xapian::DB_CREATE_OR_OVERWRITE); $xapian_term_generator = new XapianTermGenerator(); $xapian_term_generator->set_stemmer(new XapianStem("english")); while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) { $doc = new XapianDocument(); $xapian_term_generator->set_document($doc); foreach ($xapian_term_weights as $field => $weight) { $xapian_term_generator->index_text($row[$field], $weight); } $xapian_term_generator->index_text($row['name'], 75, 'S:'); $doc->add_boolean_term('CODE:' . $row['product_code']); $doc->add_value($xapian_value_slots['price'], Xapian::sortable_serialise($row['price'])); $doc->add_value($xapian_value_slots['publish_date'], strftime("%Y%m%d", strtotime($row['publish_date']))); // add in additional values that we're going to use for facets $doc->add_value($xapian_value_slots['availability'], $row['availability']); $doc->set_data(serialize($doc_data)); $docid = 'Q'.$row['item_guid']; $xapian_db->replace_document($docid, $doc); } ?>
  • 36. <?php $xapian_db = new XapianDatabase($xapian); $query_parser = new XapianQueryParser(); $query_parser->set_stemmer(new XapianStem("english")); $query_parser->set_default_op(XapianQuery::OP_AND); $dvrProcessor = new XapianDateValueRangeProcessor($xapian_value_slots['publish_date'], 'date:'); $query_parser->add_valuerangeprocessor($dvrProcessor); $query_parser->add_prefix("code", "CODE:"); $query_parser->add_prefix("category", "CATEGORY:"); $query_parser->add_prefix("title", "S:"); $query = $query_parser->parse_query('“Medical devices” NEAR china NOT russian price:10..150 category:medical'); $enquire = new XapianEnquire($xapian_db); $enquire->set_query($query); $matches = $enquire->get_mset($offset, $pagesize); while (!($start->equals($end))) { $doc = $start->get_document(); $price = Xapian::sortable_unserialise($doc->get_value($xapian_value_slots['price'])); $start->next(); }?>
  • 37. Only one option available from Xapian • Requires additional compilation due to licensing • Not very well documented API
  • 38. Reasonably fast indexing • Very flexible implementation • Faceting and range searching • Good Quick Start guide • Responsive mailing list • Third-party paid support
  • 39. Every project has different needs • Not one search product fits all • Fastest to index was Sphinx • Most feature-rich: Solr • The next steps are up to you