commit
This commit is contained in:
94
solr/example/README.md
Normal file
94
solr/example/README.md
Normal file
@@ -0,0 +1,94 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
Solr example
|
||||
------------
|
||||
|
||||
This directory contains Solr examples. Each example is contained in a
|
||||
separate directory. To run a specific example, do:
|
||||
|
||||
```
|
||||
bin/solr start -e <EXAMPLE> where <EXAMPLE> is one of:
|
||||
|
||||
cloud : SolrCloud example
|
||||
schemaless : Schema-less example (schema is inferred from data during indexing)
|
||||
techproducts : Kitchen sink example providing comprehensive examples of Solr features
|
||||
films : Example of starting with _default configset and defining explicit fields dynamically.
|
||||
```
|
||||
|
||||
For instance, if you want to run the SolrCloud example, do:
|
||||
|
||||
```
|
||||
bin/solr start -e cloud
|
||||
```
|
||||
|
||||
To see all the options available when starting Solr:
|
||||
|
||||
```
|
||||
bin/solr start -help
|
||||
```
|
||||
|
||||
After starting a Solr example, direct your Web browser to:
|
||||
|
||||
```
|
||||
http://localhost:8983/solr/
|
||||
```
|
||||
|
||||
To add documents to the index, use bin/solr post, for example:
|
||||
|
||||
```
|
||||
bin/solr post -c techproducts example/exampledocs/*.xml
|
||||
```
|
||||
|
||||
(where "techproducts" is the Solr core name)
|
||||
|
||||
For more information about this example please read...
|
||||
|
||||
* [solr/example/README.md](./README.md)
|
||||
|
||||
For more information about the "Solr Home" and Solr specific configuration
|
||||
|
||||
* https://solr.apache.org/guide/solr-tutorial.html
|
||||
|
||||
For a Solr tutorial
|
||||
|
||||
* https://solr.apache.org/resources.html
|
||||
|
||||
For a list of other tutorials and introductory articles.
|
||||
|
||||
Notes About These Examples
|
||||
--------------------------
|
||||
|
||||
### References to Jar Files Outside This Directory
|
||||
|
||||
Various example SolrHome dirs contained in this directory may use "<lib>"
|
||||
statements in the solrconfig.xml file to reference plugin jars outside of
|
||||
this directory for loading modules via relative paths.
|
||||
|
||||
If you make a copy of this example server and wish to use the
|
||||
ExtractingRequestHandler (SolrCell), the clustering component,
|
||||
or any other modules, you will need to
|
||||
copy the required jars or update the paths to those jars in your
|
||||
solrconfig.xml.
|
||||
|
||||
### Logging
|
||||
|
||||
By default, Jetty & Solr will log to the console and logs/solr.log. This can
|
||||
be convenient when first getting started, but eventually you will want to
|
||||
log just to a file. To configure logging, edit the log4j2.xml file in
|
||||
".../server/resources".
|
||||
It is also possible to set up log4j or other popular logging frameworks.
|
||||
11
solr/example/exampledocs/books.csv
Normal file
11
solr/example/exampledocs/books.csv
Normal file
@@ -0,0 +1,11 @@
|
||||
id,cat,name,price,inStock,author,series_t,sequence_i,genre_s
|
||||
0553573403,book,A Game of Thrones,7.99,true,George R.R. Martin,"A Song of Ice and Fire",1,fantasy
|
||||
0553579908,book,A Clash of Kings,7.99,true,George R.R. Martin,"A Song of Ice and Fire",2,fantasy
|
||||
055357342X,book,A Storm of Swords,7.99,true,George R.R. Martin,"A Song of Ice and Fire",3,fantasy
|
||||
0553293354,book,Foundation,7.99,true,Isaac Asimov,Foundation Novels,1,scifi
|
||||
0812521390,book,The Black Company,6.99,false,Glen Cook,The Chronicles of The Black Company,1,fantasy
|
||||
0812550706,book,Ender's Game,6.99,true,Orson Scott Card,Ender,1,scifi
|
||||
0441385532,book,Jhereg,7.95,false,Steven Brust,Vlad Taltos,1,fantasy
|
||||
0380014300,book,Nine Princes In Amber,6.99,true,Roger Zelazny,the Chronicles of Amber,1,fantasy
|
||||
0805080481,book,The Book of Three,5.99,true,Lloyd Alexander,The Chronicles of Prydain,1,fantasy
|
||||
080508049X,book,The Black Cauldron,5.99,true,Lloyd Alexander,The Chronicles of Prydain,2,fantasy
|
||||
|
51
solr/example/exampledocs/books.json
Normal file
51
solr/example/exampledocs/books.json
Normal file
@@ -0,0 +1,51 @@
|
||||
[
|
||||
{
|
||||
"id" : "978-0641723445",
|
||||
"cat" : ["book","hardcover"],
|
||||
"name" : "The Lightning Thief",
|
||||
"author" : "Rick Riordan",
|
||||
"series_t" : "Percy Jackson and the Olympians",
|
||||
"sequence_i" : 1,
|
||||
"genre_s" : "fantasy",
|
||||
"inStock" : true,
|
||||
"price" : 12.50,
|
||||
"pages_i" : 384
|
||||
}
|
||||
,
|
||||
{
|
||||
"id" : "978-1423103349",
|
||||
"cat" : ["book","paperback"],
|
||||
"name" : "The Sea of Monsters",
|
||||
"author" : "Rick Riordan",
|
||||
"series_t" : "Percy Jackson and the Olympians",
|
||||
"sequence_i" : 2,
|
||||
"genre_s" : "fantasy",
|
||||
"inStock" : true,
|
||||
"price" : 6.49,
|
||||
"pages_i" : 304
|
||||
}
|
||||
,
|
||||
{
|
||||
"id" : "978-1857995879",
|
||||
"cat" : ["book","paperback"],
|
||||
"name" : "Sophie's World : The Greek Philosophers",
|
||||
"author" : "Jostein Gaarder",
|
||||
"sequence_i" : 1,
|
||||
"genre_s" : "fantasy",
|
||||
"inStock" : true,
|
||||
"price" : 3.07,
|
||||
"pages_i" : 64
|
||||
}
|
||||
,
|
||||
{
|
||||
"id" : "978-1933988177",
|
||||
"cat" : ["book","paperback"],
|
||||
"name" : "Lucene in Action, Second Edition",
|
||||
"author" : "Michael McCandless",
|
||||
"sequence_i" : 1,
|
||||
"genre_s" : "IT",
|
||||
"inStock" : true,
|
||||
"price" : 30.50,
|
||||
"pages_i" : 475
|
||||
}
|
||||
]
|
||||
32
solr/example/exampledocs/gb18030-example.xml
Normal file
32
solr/example/exampledocs/gb18030-example.xml
Normal file
@@ -0,0 +1,32 @@
|
||||
<?xml version="1.0" encoding="GB18030"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">GB18030TEST</field>
|
||||
<field name="name">Test with some GB18030 encoded characters</field>
|
||||
<field name="features">No accents here</field>
|
||||
<field name="features">这是一个功能</field>
|
||||
<field name="features">This is a feature (translated)</field>
|
||||
<field name="features">这份文件是很有光泽</field>
|
||||
<field name="features">This document is very shiny (translated)</field>
|
||||
<field name="price">0.0</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
</add>
|
||||
|
||||
56
solr/example/exampledocs/hd.xml
Normal file
56
solr/example/exampledocs/hd.xml
Normal file
@@ -0,0 +1,56 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">SP2514N</field>
|
||||
<field name="name">Samsung SpinPoint P120 SP2514N - hard drive - 250 GB - ATA-133</field>
|
||||
<field name="manu">Samsung Electronics Co. Ltd.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">samsung</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">hard drive</field>
|
||||
<field name="features">7200RPM, 8MB cache, IDE Ultra ATA-133</field>
|
||||
<field name="features">NoiseGuard, SilentSeek technology, Fluid Dynamic Bearing (FDB) motor</field>
|
||||
<field name="price">92.0</field>
|
||||
<field name="popularity">6</field>
|
||||
<field name="inStock">true</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
<!-- Near Oklahoma City -->
|
||||
<field name="store">35.0752,-97.032</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">6H500F0</field>
|
||||
<field name="name">Maxtor DiamondMax 11 - hard drive - 500 GB - SATA-300</field>
|
||||
<field name="manu">Maxtor Corp.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">maxtor</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">hard drive</field>
|
||||
<field name="features">SATA 3.0Gb/s, NCQ</field>
|
||||
<field name="features">8.5ms seek</field>
|
||||
<field name="features">16MB cache</field>
|
||||
<field name="price">350.0</field>
|
||||
<field name="popularity">6</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.17614,-93.87341</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
</doc>
|
||||
</add>
|
||||
|
||||
60
solr/example/exampledocs/ipod_other.xml
Normal file
60
solr/example/exampledocs/ipod_other.xml
Normal file
@@ -0,0 +1,60 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
|
||||
<doc>
|
||||
<field name="id">F8V7067-APL-KIT</field>
|
||||
<field name="name">Belkin Mobile Power Cord for iPod w/ Dock</field>
|
||||
<field name="manu">Belkin</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">belkin</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">connector</field>
|
||||
<field name="features">car power adapter, white</field>
|
||||
<field name="weight">4.0</field>
|
||||
<field name="price">19.95</field>
|
||||
<field name="popularity">1</field>
|
||||
<field name="inStock">false</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.18014,-93.87741</field>
|
||||
<field name="manufacturedate_dt">2005-08-01T16:30:25Z</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">IW-02</field>
|
||||
<field name="name">iPod &amp; iPod Mini USB 2.0 Cable</field>
|
||||
<field name="manu">Belkin</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">belkin</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">connector</field>
|
||||
<field name="features">car power adapter for iPod, white</field>
|
||||
<field name="weight">2.0</field>
|
||||
<field name="price">11.50</field>
|
||||
<field name="popularity">1</field>
|
||||
<field name="inStock">false</field>
|
||||
<!-- San Francisco store -->
|
||||
<field name="store">37.7752,-122.4232</field>
|
||||
<field name="manufacturedate_dt">2006-02-14T23:55:59Z</field>
|
||||
</doc>
|
||||
|
||||
|
||||
</add>
|
||||
|
||||
|
||||
|
||||
40
solr/example/exampledocs/ipod_video.xml
Normal file
40
solr/example/exampledocs/ipod_video.xml
Normal file
@@ -0,0 +1,40 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add><doc>
|
||||
<field name="id">MA147LL/A</field>
|
||||
<field name="name">Apple 60 GB iPod with Video Playback Black</field>
|
||||
<field name="manu">Apple Computer Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">apple</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">music</field>
|
||||
<field name="features">iTunes, Podcasts, Audiobooks</field>
|
||||
<field name="features">Stores up to 15,000 songs, 25,000 photos, or 150 hours of video</field>
|
||||
<field name="features">2.5-inch, 320x240 color TFT LCD display with LED backlight</field>
|
||||
<field name="features">Up to 20 hours of battery life</field>
|
||||
<field name="features">Plays AAC, MP3, WAV, AIFF, Audible, Apple Lossless, H.264 video</field>
|
||||
<field name="features">Notes, Calendar, Phone book, Hold button, Date display, Photo wallet, Built-in games, JPEG photo playback, Upgradeable firmware, USB 2.0 compatibility, Playback speed control, Rechargeable capability, Battery level indication</field>
|
||||
<field name="includes">earbud headphones, USB cable</field>
|
||||
<field name="weight">5.5</field>
|
||||
<field name="price">399.00</field>
|
||||
<field name="popularity">10</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Dodge City store -->
|
||||
<field name="store">37.7752,-100.0232</field>
|
||||
<field name="manufacturedate_dt">2005-10-12T08:00:00Z</field>
|
||||
</doc></add>
|
||||
75
solr/example/exampledocs/manufacturers.xml
Normal file
75
solr/example/exampledocs/manufacturers.xml
Normal file
@@ -0,0 +1,75 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">adata</field>
|
||||
<field name="compName_s">A-Data Technology</field>
|
||||
<field name="address_s">46221 Landing Parkway Fremont, CA 94538</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">apple</field>
|
||||
<field name="compName_s">Apple</field>
|
||||
<field name="address_s">1 Infinite Way, Cupertino CA</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">asus</field>
|
||||
<field name="compName_s">ASUS Computer</field>
|
||||
<field name="address_s">800 Corporate Way Fremont, CA 94539</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">ati</field>
|
||||
<field name="compName_s">ATI Technologies</field>
|
||||
<field name="address_s">33 Commerce Valley Drive East Thornhill, ON L3T 7N6 Canada</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">belkin</field>
|
||||
<field name="compName_s">Belkin</field>
|
||||
<field name="address_s">12045 E. Waterfront Drive Playa Vista, CA 90094</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">canon</field>
|
||||
<field name="compName_s">Canon, Inc.</field>
|
||||
<field name="address_s">One Canon Plaza Lake Success, NY 11042</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">corsair</field>
|
||||
<field name="compName_s">Corsair Microsystems</field>
|
||||
<field name="address_s">46221 Landing Parkway Fremont, CA 94538</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">dell</field>
|
||||
<field name="compName_s">Dell, Inc.</field>
|
||||
<field name="address_s">One Dell Way Round Rock, Texas 78682</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">maxtor</field>
|
||||
<field name="compName_s">Maxtor Corporation</field>
|
||||
<field name="address_s">920 Disc Drive Scotts Valley, CA 95066</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">samsung</field>
|
||||
<field name="compName_s">Samsung Electronics Co. Ltd.</field>
|
||||
<field name="address_s">105 Challenger Rd. Ridgefield Park, NJ 07660-0511</field>
|
||||
</doc>
|
||||
<doc>
|
||||
<field name="id">viewsonic</field>
|
||||
<field name="compName_s">ViewSonic Corp</field>
|
||||
<field name="address_s">381 Brea Canyon Road Walnut, CA 91789-0708</field>
|
||||
</doc>
|
||||
</add>
|
||||
|
||||
77
solr/example/exampledocs/mem.xml
Normal file
77
solr/example/exampledocs/mem.xml
Normal file
@@ -0,0 +1,77 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">TWINX2048-3200PRO</field>
|
||||
<field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
|
||||
<field name="manu">Corsair Microsystems Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">corsair</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">memory</field>
|
||||
<field name="features">CAS latency 2, 2-3-3-6 timing, 2.75v, unbuffered, heat-spreader</field>
|
||||
<field name="price">185.00</field>
|
||||
<field name="popularity">5</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- San Francisco store -->
|
||||
<field name="store">37.7752,-122.4232</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
|
||||
<!-- a field for testing payload tagged text via DelimitedPayloadTokenFilter -->
|
||||
<field name="payloads">electronics|6.0 memory|3.0</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">VS1GB400C3</field>
|
||||
<field name="name">CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail</field>
|
||||
<field name="manu">Corsair Microsystems Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">corsair</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">memory</field>
|
||||
<field name="price">74.99</field>
|
||||
<field name="popularity">7</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Dodge City store -->
|
||||
<field name="store">37.7752,-100.0232</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
|
||||
<field name="payloads">electronics|4.0 memory|2.0</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">VDBDB1A16</field>
|
||||
<field name="name">A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM</field>
|
||||
<field name="manu">A-DATA Technology Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">corsair</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">memory</field>
|
||||
<field name="features">CAS latency 3, 2.7v</field>
|
||||
<!-- note: price is missing on this one; popularity is explicitly set to 0 -->
|
||||
<field name="popularity">0</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.18414,-93.88141</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
|
||||
<field name="payloads">electronics|0.9 memory|0.1</field>
|
||||
</doc>
|
||||
|
||||
</add>
|
||||
|
||||
65
solr/example/exampledocs/money.xml
Normal file
65
solr/example/exampledocs/money.xml
Normal file
@@ -0,0 +1,65 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Example documents utilizing the CurrencyField type -->
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">USD</field>
|
||||
<field name="name">One Dollar</field>
|
||||
<field name="manu">Bank of America</field>
|
||||
<field name="manu_id_s">boa</field>
|
||||
<field name="cat">currency</field>
|
||||
<field name="features">Coins and notes</field>
|
||||
<field name="price_c">1,USD</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">EUR</field>
|
||||
<field name="name">One Euro</field>
|
||||
<field name="manu">European Union</field>
|
||||
<field name="manu_id_s">eu</field>
|
||||
<field name="cat">currency</field>
|
||||
<field name="features">Coins and notes</field>
|
||||
<field name="price_c">1,EUR</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">GBP</field>
|
||||
<field name="name">One British Pound</field>
|
||||
<field name="manu">U.K.</field>
|
||||
<field name="manu_id_s">uk</field>
|
||||
<field name="cat">currency</field>
|
||||
<field name="features">Coins and notes</field>
|
||||
<field name="price_c">1,GBP</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
|
||||
<doc>
|
||||
<field name="id">NOK</field>
|
||||
<field name="name">One Krone</field>
|
||||
<field name="manu">Bank of Norway</field>
|
||||
<field name="manu_id_s">nor</field>
|
||||
<field name="cat">currency</field>
|
||||
<field name="features">Coins and notes</field>
|
||||
<field name="price_c">1,NOK</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
|
||||
</add>
|
||||
|
||||
34
solr/example/exampledocs/monitor.xml
Normal file
34
solr/example/exampledocs/monitor.xml
Normal file
@@ -0,0 +1,34 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add><doc>
|
||||
<field name="id">3007WFP</field>
|
||||
<field name="name">Dell Widescreen UltraSharp 3007WFP</field>
|
||||
<field name="manu">Dell, Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">dell</field>
|
||||
<field name="cat">electronics and computer1</field>
|
||||
<field name="features">30" TFT active matrix LCD, 2560 x 1600, .25mm dot pitch, 700:1 contrast</field>
|
||||
<field name="includes">USB cable</field>
|
||||
<field name="weight">401.6</field>
|
||||
<field name="price">2199.0</field>
|
||||
<field name="popularity">6</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">43.17614,-90.57341</field>
|
||||
</doc></add>
|
||||
|
||||
33
solr/example/exampledocs/monitor2.xml
Normal file
33
solr/example/exampledocs/monitor2.xml
Normal file
@@ -0,0 +1,33 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add><doc>
|
||||
<field name="id">VA902B</field>
|
||||
<field name="name">ViewSonic VA902B - flat panel display - TFT - 19"</field>
|
||||
<field name="manu">ViewSonic Corp.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">viewsonic</field>
|
||||
<field name="cat">electronics and stuff2</field>
|
||||
<field name="features">19" TFT active matrix LCD, 8ms response time, 1280 x 1024 native resolution</field>
|
||||
<field name="weight">190.4</field>
|
||||
<field name="price">279.95</field>
|
||||
<field name="popularity">6</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.18814,-93.88541</field>
|
||||
</doc></add>
|
||||
|
||||
3
solr/example/exampledocs/more_books.jsonl
Normal file
3
solr/example/exampledocs/more_books.jsonl
Normal file
@@ -0,0 +1,3 @@
|
||||
{"id":"0060248025","name":"Falling Up","inStock": true,"author": "Shel Silverstein"}
|
||||
{"id":"0679805273","name":"Oh, The Places You'll Go","inStock": true,"author": "Dr. Seuss"}
|
||||
|
||||
43
solr/example/exampledocs/mp500.xml
Normal file
43
solr/example/exampledocs/mp500.xml
Normal file
@@ -0,0 +1,43 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add><doc>
|
||||
<field name="id">0579B002</field>
|
||||
<field name="name">Canon PIXMA MP500 All-In-One Photo Printer</field>
|
||||
<field name="manu">Canon Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">canon</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">multifunction printer</field>
|
||||
<field name="cat">printer</field>
|
||||
<field name="cat">scanner</field>
|
||||
<field name="cat">copier</field>
|
||||
<field name="features">Multifunction ink-jet color photo printer</field>
|
||||
<field name="features">Flatbed scanner, optical scan resolution of 1,200 x 2,400 dpi</field>
|
||||
<field name="features">2.5" color LCD preview screen</field>
|
||||
<field name="features">Duplex Copying</field>
|
||||
<field name="features">Printing speed up to 29ppm black, 19ppm color</field>
|
||||
<field name="features">Hi-Speed USB</field>
|
||||
<field name="features">memory card: CompactFlash, Micro Drive, SmartMedia, Memory Stick, Memory Stick Pro, SD Card, and MultiMediaCard</field>
|
||||
<field name="weight">352.0</field>
|
||||
<field name="price">179.99</field>
|
||||
<field name="popularity">6</field>
|
||||
<field name="inStock">true</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.19214,-93.89941</field>
|
||||
</doc></add>
|
||||
|
||||
BIN
solr/example/exampledocs/post.jar
Normal file
BIN
solr/example/exampledocs/post.jar
Normal file
Binary file not shown.
13
solr/example/exampledocs/sample.html
Normal file
13
solr/example/exampledocs/sample.html
Normal file
@@ -0,0 +1,13 @@
|
||||
<html>
|
||||
<head>
|
||||
<title>Welcome to Solr</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>
|
||||
Here is some text
|
||||
</p>
|
||||
<p>distinct<br/>words</p>
|
||||
<div>Here is some text in a div</div>
|
||||
<div>This has a <a href="http://www.apache.org">link</a>.</div>
|
||||
</body>
|
||||
</html>
|
||||
38
solr/example/exampledocs/sd500.xml
Normal file
38
solr/example/exampledocs/sd500.xml
Normal file
@@ -0,0 +1,38 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add><doc>
|
||||
<field name="id">9885A004</field>
|
||||
<field name="name">Canon PowerShot SD500</field>
|
||||
<field name="manu">Canon Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">canon</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">camera</field>
|
||||
<field name="features">3x zoom, 7.1 megapixel Digital ELPH</field>
|
||||
<field name="features">movie clips up to 640x480 @30 fps</field>
|
||||
<field name="features">2.0" TFT LCD, 118,000 pixels</field>
|
||||
<field name="features">built in flash, red-eye reduction</field>
|
||||
<field name="includes">32MB SD card, USB cable, AV cable, battery</field>
|
||||
<field name="weight">6.4</field>
|
||||
<field name="price">329.95</field>
|
||||
<field name="popularity">7</field>
|
||||
<field name="inStock">true</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z</field>
|
||||
<!-- Buffalo store -->
|
||||
<field name="store">45.19614,-93.90341</field>
|
||||
</doc></add>
|
||||
BIN
solr/example/exampledocs/solr-word.pdf
Normal file
BIN
solr/example/exampledocs/solr-word.pdf
Normal file
Binary file not shown.
38
solr/example/exampledocs/solr.xml
Normal file
38
solr/example/exampledocs/solr.xml
Normal file
@@ -0,0 +1,38 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">SOLR1000</field>
|
||||
<field name="name">Solr, the Enterprise Search Server</field>
|
||||
<field name="manu">Apache Software Foundation</field>
|
||||
<field name="cat">software</field>
|
||||
<field name="cat">search</field>
|
||||
<field name="features">Advanced Full-Text Search Capabilities using Lucene</field>
|
||||
<field name="features">Optimized for High Volume Web Traffic</field>
|
||||
<field name="features">Standards Based Open Interfaces - XML and HTTP</field>
|
||||
<field name="features">Comprehensive HTML Administration Interfaces</field>
|
||||
<field name="features">Scalability - Efficient Replication to other Solr Search Servers</field>
|
||||
<field name="features">Flexible and Adaptable with XML configuration and Schema</field>
|
||||
<field name="features">Good unicode support: héllo (hello with an accent over the e)</field>
|
||||
<field name="price">0.0</field>
|
||||
<field name="popularity">10</field>
|
||||
<field name="inStock">true</field>
|
||||
<field name="incubationdate_dt">2006-01-17T00:00:00.000Z</field>
|
||||
</doc>
|
||||
</add>
|
||||
|
||||
93
solr/example/exampledocs/test_utf8.sh
Executable file
93
solr/example/exampledocs/test_utf8.sh
Executable file
@@ -0,0 +1,93 @@
|
||||
#!/bin/sh
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Test script to tell if the server is accepting UTF-8.
# The python response writer currently escapes non-ASCII chars, so it's good for testing.
#
# Usage: test_utf8.sh [SOLR_URL]
#   SOLR_URL defaults to http://localhost:8983/solr

SOLR_URL=http://localhost:8983/solr

# "$1" is quoted so an empty or whitespace-containing argument cannot break the test.
if [ -n "$1" ]; then
  SOLR_URL=$1
fi

# Sanity check: the echoed request parameters must contain the query term.
if curl "$SOLR_URL/select?q=hello&echoParams=explicit&wt=python" 2> /dev/null | grep 'hello' > /dev/null 2>&1; then
  echo "Solr server is up."
else
  echo "ERROR: Could not curl to Solr - is curl installed? Is Solr not running?"
  exit 1
fi

# URL-encoded UTF-8 "héllo" sent as a GET query parameter.
if curl "$SOLR_URL/select?q=h%C3%A9llo&echoParams=explicit&wt=python" 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1; then
  echo "HTTP GET is accepting UTF-8"
else
  echo "ERROR: HTTP GET is not accepting UTF-8"
fi

# Same query as a form-encoded POST body with an explicit UTF-8 charset.
if curl "$SOLR_URL/select" --data-binary 'q=h%C3%A9llo&echoParams=explicit&wt=python' -H 'Content-type:application/x-www-form-urlencoded; charset=UTF-8' 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1; then
  echo "HTTP POST is accepting UTF-8"
else
  echo "ERROR: HTTP POST is not accepting UTF-8"
fi

# Same POST without an explicit Content-type header.
if curl "$SOLR_URL/select" --data-binary 'q=h%C3%A9llo&echoParams=explicit&wt=python' 2> /dev/null | grep 'h\\u00e9llo' > /dev/null 2>&1; then
  echo "HTTP POST defaults to UTF-8"
else
  echo "HTTP POST does not default to UTF-8"
fi


# A unicode character outside of the BMP (a circle with an x inside)
CHAR="𐌈"
CODEPOINT='0x10308'
# URL encoded UTF8 of the codepoint
UTF8_Q='%F0%90%8C%88'
# expected return of the python writer (currently uses UTF-16 surrogates)
EXPECTED='\\ud800\\udf08'

if curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit&wt=python" 2> /dev/null | grep "$EXPECTED" > /dev/null 2>&1; then
  echo "HTTP GET is accepting UTF-8 beyond the basic multilingual plane"
else
  echo "ERROR: HTTP GET is not accepting UTF-8 beyond the basic multilingual plane"
fi

if curl "$SOLR_URL/select" --data-binary "q=$UTF8_Q&echoParams=explicit&wt=python" -H 'Content-type:application/x-www-form-urlencoded; charset=UTF-8' 2> /dev/null | grep "$EXPECTED" > /dev/null 2>&1; then
  echo "HTTP POST is accepting UTF-8 beyond the basic multilingual plane"
else
  echo "ERROR: HTTP POST is not accepting UTF-8 beyond the basic multilingual plane"
fi

# POST with an empty body; the query travels in the URL.
if curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit&wt=python" --data-binary '' 2> /dev/null | grep "$EXPECTED" > /dev/null 2>&1; then
  echo "HTTP POST + URL params is accepting UTF-8 beyond the basic multilingual plane"
else
  echo "ERROR: HTTP POST + URL params is not accepting UTF-8 beyond the basic multilingual plane"
fi

#curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit" 2> /dev/null | od -tx1 -w1000 | sed 's/ //g' | grep 'f4808198' > /dev/null 2>&1
# No wt parameter here: the response must contain the raw character itself.
if curl "$SOLR_URL/select?q=$UTF8_Q&echoParams=explicit" 2> /dev/null | grep "$CHAR" > /dev/null 2>&1; then
  echo "Response correctly returns UTF-8 beyond the basic multilingual plane"
else
  echo "ERROR: Response can't return UTF-8 beyond the basic multilingual plane"
fi
|
||||
|
||||
|
||||
41
solr/example/exampledocs/utf8-example.xml
Normal file
41
solr/example/exampledocs/utf8-example.xml
Normal file
@@ -0,0 +1,41 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
After posting this to Solr with bin/solr post, searching for "êâîôû" from
|
||||
the solr/admin/ search page must return this document.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">UTF8TEST</field>
|
||||
<field name="name">Test with some UTF-8 encoded characters</field>
|
||||
<field name="manu">Apache Software Foundation</field>
|
||||
<field name="cat">software</field>
|
||||
<field name="cat">search</field>
|
||||
<field name="features">No accents here</field>
|
||||
<field name="features">This is an e acute: é</field>
|
||||
<field name="features">eaiou with circumflexes: êâîôû</field>
|
||||
<field name="features">eaiou with umlauts: ëäïöü</field>
|
||||
<field name="features">tag with escaped chars: &lt;nicetag/&gt;</field>
|
||||
<field name="features">escaped ampersand: Bonnie &amp; Clyde</field>
|
||||
<field name="features">Outside the BMP:𐌈 codepoint=10308, a circle with an x inside. UTF8=f0908c88 UTF16=d800 df08</field>
|
||||
<field name="price">0.0</field>
|
||||
<field name="inStock">true</field>
|
||||
</doc>
|
||||
</add>
|
||||
62
solr/example/exampledocs/vidcard.xml
Normal file
62
solr/example/exampledocs/vidcard.xml
Normal file
@@ -0,0 +1,62 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<add>
|
||||
<doc>
|
||||
<field name="id">EN7800GTX/2DHTV/256M</field>
|
||||
<field name="name">ASUS Extreme N7800GTX/2DHTV (256 MB)</field>
|
||||
<!-- Denormalized -->
|
||||
<field name="manu">ASUS Computer Inc.</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">asus</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">graphics card</field>
|
||||
<field name="features">NVIDIA GeForce 7800 GTX GPU/VPU clocked at 486MHz</field>
|
||||
<field name="features">256MB GDDR3 Memory clocked at 1.35GHz</field>
|
||||
<field name="features">PCI Express x16</field>
|
||||
<field name="features">Dual DVI connectors, HDTV out, video input</field>
|
||||
<field name="features">OpenGL 2.0, DirectX 9.0</field>
|
||||
<field name="weight">16.0</field>
|
||||
<field name="price">479.95</field>
|
||||
<field name="popularity">7</field>
|
||||
<field name="store">40.7143,-74.006</field>
|
||||
<field name="inStock">false</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z/DAY</field>
|
||||
</doc>
|
||||
<!-- yes, you can add more than one document at a time -->
|
||||
<doc>
|
||||
<field name="id">100-435805</field>
|
||||
<field name="name">ATI Radeon X1900 XTX 512 MB PCIE Video Card</field>
|
||||
<field name="manu">ATI Technologies</field>
|
||||
<!-- Join -->
|
||||
<field name="manu_id_s">ati</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">graphics card</field>
|
||||
<field name="features">ATI RADEON X1900 GPU/VPU clocked at 650MHz</field>
|
||||
<field name="features">512MB GDDR3 SDRAM clocked at 1.55GHz</field>
|
||||
<field name="features">PCI Express x16</field>
|
||||
<field name="features">dual DVI, HDTV, svideo, composite out</field>
|
||||
<field name="features">OpenGL 2.0, DirectX 9.0</field>
|
||||
<field name="weight">48.0</field>
|
||||
<field name="price">649.99</field>
|
||||
<field name="popularity">7</field>
|
||||
<field name="inStock">false</field>
|
||||
<field name="manufacturedate_dt">2006-02-13T15:26:37Z/DAY</field>
|
||||
<!-- NYC store -->
|
||||
<field name="store">40.7143,-74.006</field>
|
||||
</doc>
|
||||
</add>
|
||||
18
solr/example/films/README.md
Normal file
18
solr/example/films/README.md
Normal file
@@ -0,0 +1,18 @@
|
||||
We have a movie data set in JSON, Solr XML, and CSV formats. All 3 formats contain the same data. You can use any one format to index documents to Solr.
|
||||
|
||||
This example uses the `_default` configset that ships with Solr plus some custom fields added via Schema API. It demonstrates the use of ParamSets in conjunction with the [Request Parameters API](https://solr.apache.org/guide/solr/latest/configuration-guide/request-parameters-api.html).
|
||||
|
||||
The original data was fetched from Freebase, and the data license is present in the films-LICENSE.txt file. Freebase was shut down by Google in 2016.
|
||||
|
||||
This data consists of the following fields:
|
||||
* `id` - unique identifier for the movie
|
||||
* `name` - Name of the movie
|
||||
* `directed_by` - The person(s) who directed the making of the film
|
||||
* `initial_release_date` - The earliest official initial film screening date in any country
|
||||
* `genre` - The genre(s) that the movie belongs to
|
||||
* `film_vector` - The 10 dimensional vector representing the film, according to a toy example embedding model
|
||||
|
||||
The `name` and `initial_release_date` fields are created via the Schema API, and the `genre` and `directed_by` fields
|
||||
are created by the use of an Update Request Processor Chain called `add-unknown-fields-to-the-schema`.
|
||||
|
||||
The `film_vector` is an embedding vector created to represent the movie with 10 dimensions. The vector is created from a BERT pre-trained model, followed by a dimension reduction technique to reduce the embeddings from 768 to 10 dimensions. Even though it is expected that similar movies will be close to each other, this model is just a "toy example", so it's not guaranteed to be a good representation for the movies. The Python scripts utilized to create the model and calculate the films vectors are in the [vectors directory](./vectors).
|
||||
3
solr/example/films/films-LICENSE.txt
Normal file
3
solr/example/films/films-LICENSE.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
The films data (films.json/.xml/.csv) is licensed under the Creative Commons Attribution 2.5 Generic License.
|
||||
To view a copy of this license, visit http://creativecommons.org/licenses/by/2.5/
|
||||
or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.
|
||||
1101
solr/example/films/films.csv
Normal file
1101
solr/example/films/films.csv
Normal file
File diff suppressed because it is too large
Load Diff
29030
solr/example/films/films.json
Normal file
29030
solr/example/films/films.json
Normal file
File diff suppressed because it is too large
Load Diff
22438
solr/example/films/films.xml
Normal file
22438
solr/example/films/films.xml
Normal file
File diff suppressed because it is too large
Load Diff
53
solr/example/films/vectors/README.md
Normal file
53
solr/example/films/vectors/README.md
Normal file
@@ -0,0 +1,53 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
We present in this directory the Python scripts that were used to create the `film_vector` field for the films dataset.
|
||||
|
||||
- [films.py](./films.py): defines some general-purpose functions to read, save and process the films dataset.
|
||||
- [create_model.py](./create_model.py): creates an embedding model to represent the films.
|
||||
- [create_dataset.py](./create_dataset.py): uses the embedding model to calculate the vectors of the films and create the new dataset with the extra `film_vector` field.
|
||||
|
||||
To replicate the example you have to run the `create_model.py` script first, followed by `create_dataset.py`. We will describe and discuss each of these scripts below.
|
||||
|
||||
## Setup
|
||||
|
||||
```
|
||||
pip install sentence-transformers
|
||||
```
|
||||
|
||||
## Creating the Model (`create_model.py`)
|
||||
|
||||
There are several approaches that one could use to create vectors (embeddings) to represent documents. In the case of our example we decided to use a _textual_ approach, where we use the text of the document as input for calculating its vector.
|
||||
|
||||
To create the "sentence" that will serve as the textual input for a movie, we take its title followed by its genres, separated by commas. For example, the "8 Mile" movie will have this sentence:
|
||||
```
|
||||
8 Mile
|
||||
|
||||
Musical, Hip hop film, Drama, Musical Drama
|
||||
```
|
||||
|
||||
We use a pretrained model from the [SentenceTransformers](https://www.sbert.net/) framework (`all-mpnet-base-v2`) as the base for creating a new, tailored, reduced model. We calculate the 768-dimensional vectors for the sentences of all the movies in the dataset, then run a PCA to extract the 10 most important dimensions. With the PCA result we create a new model that produces vectors of size 10. The number of dimensions is a compromise between performance and quality, and we chose 10 here just to serve as a small and compact example. Generally, the higher the number of dimensions, the higher the quality, but also the higher the memory consumption and the computational time needed to manipulate the vectors.
|
||||
|
||||
This model is created to serve as a small example to demonstrate the vector features of Solr, so it is just one among many possible ways to create vectors for documents. For example, it is possible to _fine-tune_ a pre-trained model using textual data from our context. Another possibility is to train a model that does not even rely on text, but uses co-occurrence of documents or items, like item2vec.
|
||||
|
||||
## Calculating Vectors (`create_dataset.py`)
|
||||
|
||||
Once we have the model created and stored we can use it to calculate the vectors of the documents.
|
||||
|
||||
First we load the model (reading it from disk to RAM). Then we read the films dataset and create the sentences (as described in the previous section). Finally, for each film we use the model to calculate and encode the film vector from its "sentence". After the `film_vector` field has been added to the dataset, we export and store it in the 3 formats (JSON, XML and CSV).
|
||||
|
||||
So, if we have new movies to be indexed in the collection, we just have to repeat the above steps: (1) load the model, (2) create the film sentence, (3) calculate the film vector from its sentence.
|
||||
68
solr/example/films/vectors/create_dataset.py
Normal file
68
solr/example/films/vectors/create_dataset.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script will use the reduced model created by the `create_model`
|
||||
# script to add a new field in the films dataset, which will store the
|
||||
# film vector according to the embedding model.
|
||||
|
||||
import json
|
||||
|
||||
from sentence_transformers import SentenceTransformer, util
|
||||
import torch
|
||||
|
||||
import films
|
||||
|
||||
#### Load the 10-dimension model stored at `films.PATH_FILMS_MODEL`
model = films.load_films_embedding_model()

#### Load the original films dataset
films_dataset = films.load_films_dataset()

#### Use the embedding model to calculate one vector per movie
films_vectors = films.calculate_films_vectors(model, films_dataset)
|
||||
|
||||
#### Visual evaluation of some specific movies
|
||||
|
||||
def most_similar_movie(target_idx, top_k=5):
    """Print the films whose vectors are most similar to a given film.

    Similarity is the cosine similarity between the target film's vector and
    every vector in ``films_vectors``. The target film is part of that corpus,
    so it is itself a candidate in the ranking.

    Args:
        target_idx: Index of the reference film in ``films_dataset``.
        top_k: Number of results to print; capped at the corpus size.
    """
    film = films_dataset[target_idx]
    film_vector = films_vectors[target_idx]

    cos_scores = util.cos_sim(film_vector, films_vectors)[0]
    # torch.topk raises when k exceeds the number of scores, so cap it.
    top_k = min(top_k, len(cos_scores))
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n======================\n")
    print("Film:", films.get_film_sentence(film).replace("\n", " - "))
    # Report the number actually requested instead of a hard-coded "5".
    print(f"\nTop {top_k} most similar films in corpus:")

    # Convert to plain Python numbers for list indexing and formatting.
    scores = top_results[0].tolist()
    indices = top_results[1].tolist()
    for score, idx in zip(scores, indices):
        movie_str = films.get_film_sentence(films_dataset[idx]).replace("\n", " - ")
        print(f" - [{idx}] {movie_str} (Score: {score:.4f})")
|
||||
|
||||
# Print the nearest neighbours of a few sample films for visual inspection.
for sample_idx in (200, 100, 500, 911):
    most_similar_movie(sample_idx)


#### Add the embedding vector to every film as the new `film_vector` field
for idx, film_entry in enumerate(films_dataset):
    film_entry["film_vector"] = list(films_vectors[idx].astype("float64"))

#### Export the extended films dataset in all three formats
films.export_films_json(films_dataset)
films.export_films_xml(films_dataset)
films.export_films_csv(films_dataset)
|
||||
110
solr/example/films/vectors/create_model.py
Normal file
110
solr/example/films/vectors/create_model.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# In this example, we reduce the dimensionality of the embeddings of
|
||||
# the SBERT pre-trained model 'all-mpnet-base-v2' from 768 to 10 dimensions.
|
||||
#
|
||||
# The code is derived from the SBERT documentation and corresponding example code:
|
||||
# - https://www.sbert.net/examples/training/distillation/README.html
|
||||
# - https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/distillation/dimensionality_reduction.py
|
||||
|
||||
from sklearn.decomposition import PCA
|
||||
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import gzip
|
||||
import csv
|
||||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import films
|
||||
|
||||
#### Print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

#### Create the folder structure used by this script
for folder in ("./data/", "./models/"):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)


######## Load full model ########

# Model to which we apply dimensionality reduction
model = SentenceTransformer("all-mpnet-base-v2")

# New size for the embeddings
new_dimension = 10


######## Evaluate performance of full model ########

# We use the STS benchmark dataset to see how much performance we lose
# by using the dimensionality reduction
sts_dataset_path = "./data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)

# We measure the performance of the original model now,
# and later we will measure the performance with the reduced dimension size
logger.info("Read STSbenchmark test dataset")
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    eval_examples = [
        InputExample(
            texts=[row["sentence1"], row["sentence2"]],
            label=float(row["score"]) / 5.0,  # Normalize score to range 0 ... 1
        )
        for row in reader
        if row["split"] == "test"
    ]

# Evaluate the original model on the STS benchmark dataset
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples, name="sts-benchmark-test"
)

logger.info("Original model performance:")
stsb_evaluator(model)


######## Reduce the embedding dimensions ########

# We load the films dataset and create a list of unique sentences
# built from the movie title and the genres
films_dataset = films.load_films_dataset()
films_sentences = list(set(films.get_films_sentences(films_dataset)))
random.shuffle(films_sentences)

# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for all the movies in the films dataset.
pca_train_sentences = films_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# We add a dense layer to the model, so that it directly produces
# embeddings with the new size
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)


######## Evaluate the model with the reduced embedding size
logger.info("Model with %s dimensions:", new_dimension)
stsb_evaluator(model)


######## Store the reduced model on disk
model.save(films.PATH_FILMS_MODEL)
|
||||
92
solr/example/films/vectors/films.py
Normal file
92
solr/example/films/vectors/films.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import csv
|
||||
from lxml import etree
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
# Input: the films dataset read by load_films_dataset().
PATH_FILMS_DATASET = "../films.json"

# Location of the embedding model loaded by load_films_embedding_model().
PATH_FILMS_MODEL = "./models/films-model-size_10"

# Outputs: one file per export_films_* function.
PATH_FILMS_VECTORS_JSON = "./data/films-vectors.json"
PATH_FILMS_VECTORS_XML = "./data/films-vectors.xml"
PATH_FILMS_VECTORS_CSV = "./data/films-vectors.csv"
|
||||
|
||||
def load_films_dataset():
    """Load the films dataset from ``PATH_FILMS_DATASET``.

    The file is opened explicitly as UTF-8 so that any non-ASCII text is
    decoded the same way on every platform, instead of depending on the
    locale's default encoding.

    Returns:
        The parsed JSON content of the dataset file.
    """
    with open(PATH_FILMS_DATASET, "r", encoding="utf-8") as infile:
        return json.load(infile)
|
||||
|
||||
def get_film_sentence(film):
    """Build the text used to embed a film.

    The result is the film's ``name``, a blank line, and then the entries of
    its ``genre`` list joined with ", ".
    """
    title = film["name"]
    genres = ", ".join(film["genre"])
    return "{}\n\n{}".format(title, genres)
|
||||
|
||||
def get_films_sentences(films_dataset):
    """Return the sentence of every film in ``films_dataset``, in order."""
    return list(map(get_film_sentence, films_dataset))
|
||||
|
||||
def load_films_embedding_model():
    """Instantiate a ``SentenceTransformer`` from ``PATH_FILMS_MODEL``."""
    embedding_model = SentenceTransformer(PATH_FILMS_MODEL)
    return embedding_model
|
||||
|
||||
def calculate_film_vector(model, film):
    """Encode a single film's sentence with ``model`` and return the result."""
    return model.encode(get_film_sentence(film))
|
||||
|
||||
def calculate_films_vectors(model, films_dataset):
    """Encode the sentences of all films in one ``model.encode`` call."""
    return model.encode(get_films_sentences(films_dataset))
|
||||
|
||||
def export_films_json(films_dataset):
    """Write ``films_dataset`` to ``PATH_FILMS_VECTORS_JSON`` as 2-space indented JSON."""
    with open(PATH_FILMS_VECTORS_JSON, mode="w") as json_file:
        json.dump(films_dataset, fp=json_file, indent=2)
|
||||
|
||||
|
||||
def export_films_xml(films_dataset):
    """Write ``films_dataset`` to ``PATH_FILMS_VECTORS_XML`` as an ``<add>`` document.

    Each film becomes a ``<doc>`` element. Each of its values becomes a
    ``<field name="...">`` child; a list value produces one ``<field>`` per
    item, and every value is stored as ``str(value)``.
    """
    root = etree.Element("add")

    for film in films_dataset:
        doc = etree.Element("doc")

        for field_name, field_value in film.items():
            # Treat scalars as one-item lists so both cases share one loop.
            values = field_value if isinstance(field_value, list) else [field_value]
            for value in values:
                field = etree.Element("field", attrib={"name": field_name})
                field.text = str(value)
                doc.append(field)

        root.append(doc)

    tree = etree.ElementTree(root)
    tree.write(
        PATH_FILMS_VECTORS_XML,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
def export_films_csv(films_dataset):
    """Write ``films_dataset`` to ``PATH_FILMS_VECTORS_CSV``.

    The multi-valued fields ``directed_by``, ``genre`` and ``film_vector``
    are flattened into single cells by joining their items with ``|``.

    The flattening is done on a per-row copy, so the caller's film dicts keep
    their original list values and can still be passed to the other exporters
    afterwards, in any order.

    Args:
        films_dataset: Iterable of film dicts. Each must contain the three
            multi-valued fields above, and only keys listed in the header.
    """
    fieldnames = [
        "name",
        "directed_by",
        "genre",
        "type",
        "id",
        "initial_release_date",
        "film_vector",
    ]
    # newline="" is what the csv module requires of files it writes to;
    # the explicit encoding keeps the output independent of the locale.
    with open(PATH_FILMS_VECTORS_CSV, "w", newline="", encoding="utf-8") as outfile:
        csvw = csv.DictWriter(outfile, fieldnames)
        csvw.writeheader()
        for film in films_dataset:
            row = dict(film)  # shallow copy: never overwrite the caller's data
            row["directed_by"] = "|".join(film["directed_by"])
            row["genre"] = "|".join(film["genre"])
            row["film_vector"] = "|".join(map(str, film["film_vector"]))
            csvw.writerow(row)
|
||||
Reference in New Issue
Block a user