diff options
64 files changed, 8070 insertions, 0 deletions
diff --git a/jstests/fts1.js b/jstests/fts1.js new file mode 100644 index 00000000000..18d300c8a23 --- /dev/null +++ b/jstests/fts1.js @@ -0,0 +1,18 @@ + +load( "jstests/libs/fts.js" ); + +t = db.text1; +t.drop(); + +t.save( { _id : 1 , x : "az b c" } ); +t.save( { _id : 2 , x : "az b" } ); +t.save( { _id : 3 , x : "b c" } ); +t.save( { _id : 4 , x : "b c d" } ); + +t.ensureIndex( { x : "text" } ); + +assert.eq( [1,2,3,4] , queryIDS( t , "c az" ) , "A1" ); +assert.eq( [4] , queryIDS( t , "d" ) , "A2" ); + + + diff --git a/jstests/fts2.js b/jstests/fts2.js new file mode 100644 index 00000000000..297f585061f --- /dev/null +++ b/jstests/fts2.js @@ -0,0 +1,23 @@ + +load( "jstests/libs/fts.js" ); + +t = db.text2; +t.drop(); + +t.save( { _id : 1 , x : "az b x" , y : "c d m" , z : 1 } ); +t.save( { _id : 2 , x : "c d y" , y : "az b n" , z : 2 } ); + +t.ensureIndex( { x : "text" } , { weights : { x : 10 , y : 1 } } ); + +assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" ); +assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" ); + +assert.eq( [1] , queryIDS( t , "x" ) , "A3" ); +assert.eq( [2] , queryIDS( t , "y" ) , "A4" ); + +assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" ); +assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" ); + +assert.eq( 2 , lastCommadResult.stats.nscannedObjects , "B3" ); +assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" ); + diff --git a/jstests/fts3.js b/jstests/fts3.js new file mode 100644 index 00000000000..30e0026825a --- /dev/null +++ b/jstests/fts3.js @@ -0,0 +1,23 @@ + +load( "jstests/libs/fts.js" ); + +t = db.text3; +t.drop(); + +t.save( { _id : 1 , x : "az b x" , y : "c d m" , z : 1 } ); +t.save( { _id : 2 , x : "c d y" , y : "az b n" , z : 2 } ); + +t.ensureIndex( { x : "text" , z : 1 } , { weights : { x : 10 , y : 1 } } ); + +assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" ); +assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" ); + +assert.eq( [1] , queryIDS( t , "x" ) , "A3" ); +assert.eq( [2] , queryIDS( t , "y" ) , "A4" ); + 
+assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" ); +assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" ); + +assert.eq( 0 , lastCommadResult.stats.nscannedObjects , "B3" ); +assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" ); + diff --git a/jstests/fts4.js b/jstests/fts4.js new file mode 100644 index 00000000000..ea38cafa756 --- /dev/null +++ b/jstests/fts4.js @@ -0,0 +1,23 @@ + +load( "jstests/libs/fts.js" ); + +t = db.text4; +t.drop(); + +t.save( { _id : 1 , x : [ "az" , "b" , "x" ] , y : [ "c" , "d" , "m" ] , z : 1 } ); +t.save( { _id : 2 , x : [ "c" , "d" , "y" ] , y : [ "az" , "b" , "n" ] , z : 2 } ); + +t.ensureIndex( { y : "text" , z : 1 } , { weights : { x : 10 } } ); + +assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" ); +assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" ); + +assert.eq( [1] , queryIDS( t , "x" ) , "A3" ); +assert.eq( [2] , queryIDS( t , "y" ) , "A4" ); + +assert.eq( [1] , queryIDS( t , "az" , { z : 1 } ) , "B1" ); +assert.eq( [1] , queryIDS( t , "d" , { z : 1 } ) , "B2" ); + +assert.eq( 0 , lastCommadResult.stats.nscannedObjects , "B3" ); +assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" ); + diff --git a/jstests/fts5.js b/jstests/fts5.js new file mode 100644 index 00000000000..d93a307a3ab --- /dev/null +++ b/jstests/fts5.js @@ -0,0 +1,23 @@ + +load( "jstests/libs/fts.js" ); + +t = db.text5; +t.drop(); + +t.save( { _id: 1 , x: [ { a: "az" } , { a: "b" } , { a: "x" } ] , y: [ "c" , "d" , "m" ] , z: 1 } ); +t.save( { _id: 2 , x: [ { a: "c" } , { a: "d" } , { a: "y" } ] , y: [ "az" , "b" , "n" ] , z: 2 } ); + +t.ensureIndex( { y: "text" , z: 1 } , { weights: { "x.a": 10 } } ); + +assert.eq( [1,2] , queryIDS( t , "az" ) , "A1" ); +assert.eq( [2,1] , queryIDS( t , "d" ) , "A2" ); + +assert.eq( [1] , queryIDS( t , "x" ) , "A3" ); +assert.eq( [2] , queryIDS( t , "y" ) , "A4" ); + +assert.eq( [1] , queryIDS( t , "az" , { z: 1 } ) , "B1" ); +assert.eq( [1] , queryIDS( t , "d" , { z: 1 } ) , "B2" ); + +assert.eq( 0 , 
lastCommadResult.stats.nscannedObjects , "B3" ); +assert.eq( 2 , lastCommadResult.stats.nscanned , "B4" ); + diff --git a/jstests/fts_blog.js b/jstests/fts_blog.js new file mode 100644 index 00000000000..68bdf6c0233 --- /dev/null +++ b/jstests/fts_blog.js @@ -0,0 +1,27 @@ + +t = db.text_blog; +t.drop(); + +t.save( { _id : 1 , title : "my blog post" , text : "this is a new blog i am writing. yay" } ); +t.save( { _id : 2 , title : "my 2nd post" , text : "this is a new blog i am writing. yay" } ); +t.save( { _id : 3 , title : "knives are Fun" , text : "this is a new blog i am writing. yay" } ); + +// default weight is 1 +// specify weights if you want a field to be more meaningull +t.ensureIndex( { "title" : "text" , text : "text" } , { weights : { title : 10 } } ); + +res = t.runCommand( "text" , { search : "blog" } ) +assert.eq( 3, res.results.length ); +assert.eq( 1, res.results[0].obj._id ); + +res = t.runCommand( "text" , { search : "write" } ) +assert.eq( 3, res.results.length ); +assert.eq( res.results[0].score, res.results[1].score ); +assert.eq( res.results[0].score, res.results[2].score ); + + + + + + + diff --git a/jstests/fts_blogwild.js b/jstests/fts_blogwild.js new file mode 100644 index 00000000000..5c00a47983d --- /dev/null +++ b/jstests/fts_blogwild.js @@ -0,0 +1,41 @@ + +t = db.text_blog; +t.drop(); + +t.save( { _id: 1 , title: "my blog post" , text: "this is a new blog i am writing. yay eliot" } ); +t.save( { _id: 2 , title: "my 2nd post" , text: "this is a new blog i am writing. yay" } ); +t.save( { _id: 3 , title: "knives are Fun for writing eliot" , text: "this is a new blog i am writing. 
yay" } ); + +// default weight is 1 +// specify weights if you want a field to be more meaningull +t.ensureIndex( { dummy: "text" } , { weights: "$**" } ); + +res = t.runCommand( "text" , { search: "blog" } ); +assert.eq( 3 , res.stats.n , "A1" ); + +res = t.runCommand( "text" , { search: "write" } ); +assert.eq( 3 , res.stats.n , "B1" ); + +// mixing +t.dropIndex( "dummy_text" ); +assert.eq( 1 , t.getIndexKeys().length , "C1" ); +t.ensureIndex( { dummy: "text" } , { weights: { "$**": 1 , title: 2 } } ); + + +res = t.runCommand( "text" , { search: "write" } ); +assert.eq( 3 , res.stats.n , "C2" ); +assert.eq( 3 , res.results[0].obj._id , "C3" ); + +res = t.runCommand( "text" , { search: "blog" } ); +assert.eq( 3 , res.stats.n , "D1" ); +assert.eq( 1 , res.results[0].obj._id , "D2" ); + +res = t.runCommand( "text" , { search: "eliot" } ); +assert.eq( 2 , res.stats.n , "E1" ); +assert.eq( 3 , res.results[0].obj._id , "E2" ); + + + + + + diff --git a/jstests/fts_mix.js b/jstests/fts_mix.js new file mode 100644 index 00000000000..62e4bfb7dd8 --- /dev/null +++ b/jstests/fts_mix.js @@ -0,0 +1,152 @@ + +load( "jstests/libs/fts.js" ); + +// test collection +tc = db.text_mix; +tc.drop(); + +// creation of collection documents +// content generated using wikipedia random article +tc.save( { _id: 1, title: "Olivia Shakespear",text: "Olivia Shakespear (born Olivia Tucker; 17 March 1863 – 3 October 1938) was a British novelist, playwright, and patron of the arts. She wrote six books that are described as \"marriage problem\" novels. Her works sold poorly, sometimes only a few hundred copies. Her last novel, Uncle Hilary, is considered her best. She wrote two plays in collaboration with Florence Farr." } ); +tc.save( { _id: 2, title: "Mahim Bora", text: "Mahim Bora (born 1926) is an Indian writer and educationist from Assam state. He was born at a tea estate of Sonitpur district. He is an M.A. 
in Assamese literature from Gauhati University and had been a teacher in the Nowgong College for most of his teaching career. He has now retired and lives at Nagaon. Bora spent a good part of his childhood in the culture-rich surroundings of rural Nagaon, where the river Kalong was the life-blood of a community. His impressionable mind was to capture a myriad memories of that childhood, later to find expression in his poems, short stories and novels with humour, irony and pathos woven into their texture. When this river was dammed up, its disturbing effect was on the entire community dependant on nature's bounty." } ); +tc.save( { _id: 3, title: "A break away!", text: "A break away! is an 1891 painting by Australian artist Tom Roberts. The painting depicts a mob of thirsty sheep stampeding towards a dam. A drover on horseback is attempting to turn the mob before they drown or crush each other in their desire to drink. The painting, an \"icon of Australian art\", is part of a series of works by Roberts that \"captures what was an emerging spirit of national identity.\" Roberts painted the work at Corowa. The painting depicts a time of drought, with little grass and the soil kicked up as dust. The work itself is a reflection on the pioneering days of the pastoral industry, which were coming to an end by the 1890s." } ); +tc.save( { _id: 4, title: "Linn-Kristin Riegelhuth Koren", text: "Linn-Kristin Riegelhuth Koren (born 1 August 1984, in Ski) is a Norwegian handballer playing for Larvik HK and the Norwegian national team. She is commonly known as Linka. Outside handball she is a qualified nurse." } ); +tc.save( { _id: 5, title: "Morten Jensen", text: "Morten Jensen (born December 2, 1982 in Lynge) is a Danish athlete. He primarily participates in long jump, 100 metres and 200 metres. 
He competed at the World Championships in 2005 and 2007, the 2006 World Indoor Championships, the 2006 European Championships, the 2007 World Championships and the 2008 Olympic Games without qualifying for the final round. He was runner-up in the 2010 Finnish Elite Games rankings, just missing out to Levern Spencer for that year's jackpot. He holds the Danish record in both long jump and 100 metres. He also holds the Danish indoor record in the 200 metres. He has been a part of the Sparta teamsine 2005, before then he was a part of FIF Hillerd. His coach was Leif Dahlberg after the 2010 European Championships he change to Lars Nielsen and Anders Miller." } ); +tc.save( { _id: 6, title: "Janet Laurence", text: "Janet Laurence (born 1947) is a Sydney based Australian artist who works in mixed media and installation. Her work has been included in major survey exhibitions, nationally and internationally and is regularly exhibited in Sydney, Melbourne and Japan. Her work explores a relationship to the natural world, often from an architectural context. It extends from the gallery space into the urban fabric, and has been realized in many site specific projects, often involving collaborations with architects, landscape architects and environmental scientists. She has received many grants and awards including a Rockefeller Residency in 1997. Laurence was a Trustee of the Art Gallery of NSW from 1995 to 2005. Laurence was the subject of John Beard's winning entry for the 2007 Archibald Prize." } ); +tc.save( { _id: 7, title: "Glen-Coats Baronets", text: "The Glen-Coats Baronetcy, of Ferguslie Park in the Parish of Abbey in the County of Renfrew, was a title in the Baronetage of the United Kingdom. It was created on 25 June 1894 for Thomas Glen-Coats, Director of the thread-making firm of J. & P. Coats, Ltd, and later Liberal Member of Parliament for Renfrewshire West. Born Thomas Coats, he assumed the additional surname of Glen, which was that of his maternal grandfather. 
He was succeeded by his son, the second Baronet. He won a gold medal in sailing at the 1908 Summer Olympics. The title became extinct on his death in 1954. Two other members of the Coats family also gained distinction. George Coats, 1st Baron Glentanar, was the younger brother of the first Baronet, while Sir James Coats, 1st Baronet (see Coats Baronets), was the first cousin of the first Baronet." } ); +tc.save( { _id: 8, title: "Grapeleaf Skeletonizer", text: "The Grapeleaf Skeletonizer, Harrisina americana is a moth in the family Zygaenidae. It is widespread in the eastern half of the United States, and commonly noticed defoliating grapes, especially of the Virginia creeper (Parthenocissus quinquefolia). The western grapeleaf skeletonizer, Harrisina brillians is very similar to and slightly larger than H. americana, but their distributions are different. Members of this family all produce hydrogen cyanide, a potent antipredator toxin." } ); +tc.save( { _id: 9, title: "Physics World", text: "Physics World is the membership magazine of the Institute of Physics, one of the largest physical societies in the world. It is an international monthly magazine covering all areas of physics, both pure and applied, and is aimed at physicists in research, industry and education worldwide. It was launched in 1988 by IOP Publishing Ltd and has established itself as one of the world's leading physics magazines. The magazine is sent free to members of the Institute of Physics, who can also access a digital edition of the magazine, although selected articles can be read by anyone for free online. It was redesigned in September 2005 and has an audited circulation of just under 35000. The current editor is Matin Durrani. Also on the team are Dens Milne (associate editor), Michael Banks (news editor), Louise Mayor (features editor) and Margaret Harris (reviews and careers editor). 
Hamish Johnston is the editor of the magazine's website physicsworld.com and James Dacey is its reporter." } ); +tc.save( { _id: 10, title: "Mallacoota, Victoria", text: "Mallacoota is a small town in the East Gippsland region of Victoria, Australia. At the 2006 census, Mallacoota had a population of 972. At holiday times, particularly Easter and Christmas, the population increases by about 8,000. It is one of the most isolated towns in the state of Victoria, 25 kilometres off the Princes Highway and 523 kilometres (325 mi) from Melbourne. It is 526 kilometres (327 mi) from Sydney, New South Wales. It is halfway between Melbourne and Sydney when travelling via Princes Highway, though that is a long route between Australia's two main cities. It is the last official township on Victoria's east coast before the border with New South Wales. Mallacoota has a regional airport (Mallacoota Airport) YMCO (XMC) consisting of a grassed field for private light planes. It is known for its wild flowers, abalone industry, the inlet estuary consisting of Top Lake and Bottom Lake, and Croajingolong National Park that surround it. It is a popular and beautiful holiday spot for boating, fishing, walking the wilderness coast, swimming, birdwatching, and surfing. The Mallacoota Arts Council runs events throughout each year. Mallacoota Inlet is one of the main villages along the wilderness coast walk from NSW to Victoria, Australia." } ); + +// begin tests + +// -------------------------------------------- INDEXING & WEIGHTING ------------------------------- + +// start with basic index, one item with default weight +tc.ensureIndex( { "title": "text" } ); + +// test the single result case.. 
+res = tc.runCommand( "text", { search: "Victoria" } ); +assert.eq( 1, res.results.length ); +assert.eq( 10, res.results[0].obj._id ); + +tc.dropIndexes(); + +// now let's see about multiple fields, with specific weighting +tc.ensureIndex( { "title": "text", "text": "text" }, { weights: { "title": 10 } } ); +assert.eq( [9,7,8], queryIDS( tc, "members physics" ) ); + +tc.dropIndexes(); + +// test all-1 weighting with "$**" +tc.ensureIndex( { "$**": "text" } ); +assert.eq( [2,8,7], queryIDS( tc, "family tea estate" ) ); + +tc.dropIndexes(); + +// non-1 weight on "$**" + other weight specified for some field +tc.ensureIndex( { "$**": "text" }, { weights: { "$**": 10, "text": 2 } } ); +assert.eq( [7,5], queryIDS( tc, "Olympic Games gold medal" ) ); + +tc.dropIndexes(); + +// -------------------------------------------- SEARCHING ------------------------------------------ + +// go back to "$**": 1, "title": 10.. and test more specific search functionality! +tc.ensureIndex( { "$**": "text" }, { weights: { "title": 10 } } ); + +// -------------------------------------------- STEMMING ------------------------------------------- + +// tests stemming for basic plural case +res = tc.runCommand( "text", { search: "member" } ); +res2 = tc.runCommand( "text", { search: "members" } ); +assert.eq( getIDS( res ), getIDS( res2 ) ); + +// search for something with potential 's bug. +res = tc.runCommand( "text", { search: "magazine's" } ); +res2 = tc.runCommand( "text", { search: "magazine" } ); +assert.eq( getIDS( res ), getIDS( res2 ) ); + +// -------------------------------------------- LIMIT RESULTS -------------------------------------- + +// ensure limit limits results +assert.eq( [2], queryIDS( tc, "rural river dam", null , { limit : 1 } ) ); + +// ensure top results are the same regardless of limit +// make sure that this uses a case where it wouldn't be otherwise.. 
+res = tc.runCommand( "text", { search: "united kingdom british princes", limit: 1 } ); +res2 = tc.runCommand( "text", { search: "united kingdom british princes" } ); +assert.eq( 1, res.results.length ); +assert.eq( 4, res2.results.length ); +assert.eq( res.results[0].obj._id, res2.results[0].obj._id ); + +// -------------------------------------------- PROJECTION ----------------------------------------- + +// test projection.. show just title and id +res = tc.runCommand( "text", { search: "Morten Jensen", projection: { title: 1 } } ); +assert.eq( 1, res.results.length ); +assert.eq( 5, res.results[0].obj._id ); +assert.eq( null, res.results[0].obj.text ); +assert.neq( null, res.results[0].obj.title ); +assert.neq( null, res.results[0].obj._id ); + +// test negative projection, ie. show everything but text +res = tc.runCommand( "text", { search: "handball", projection: { text: 0 } } ); +assert.eq( 1, res.results.length ); +assert.eq( 4, res.results[0].obj._id ); +assert.eq( null, res.results[0].obj.text ); +assert.neq( null, res.results[0].obj.title ); +assert.neq( null, res.results[0].obj._id ); + +// test projection only title, no id +res = tc.runCommand( "text", { search: "Mahim Bora", projection: { _id: 0, title: 1 } } ); +assert.eq( 1, res.results.length ); +assert.eq( "Mahim Bora", res.results[0].obj.title ); +assert.eq( null, res.results[0].obj.text ); +assert.neq( null, res.results[0].obj.title ); +assert.eq( null, res.results[0].obj._id ); + +// -------------------------------------------- NEGATION ------------------------------------------- + +// test negation +assert.eq( [8], queryIDS( tc, "United -Kingdom" ) ); +assert.eq( -1, tc.findOne( { _id : 8 } ).text.search(/Kingdom/i) ); + +// test negation edge cases... hyphens, double dash, etc. 
+assert.eq( [4], queryIDS( tc, "Linn-Kristin" ) ); + +// -------------------------------------------- PHRASE MATCHING ------------------------------------ + +// test exact phrase matching on +assert.eq( [7], queryIDS( tc, "\"Summer Olympics\"" ) ); +assert.neq( -1, tc.findOne( { _id: 7 } ).text.indexOf("Summer Olympics") ); + +// phrasematch with other stuff.. negation, other terms, etc. +assert.eq( [10], queryIDS( tc, "\"wild flowers\" Sydney" ) ); + +assert.eq( [3], queryIDS( tc, "\"industry\" -Melbourne -Physics" ) ); + +// -------------------------------------------- EDGE CASES ----------------------------------------- + +// test empty string +res = tc.runCommand( "text", { search: "" } ); +assert.eq( 0, res.ok ) + +// test string with a space in it +res = tc.runCommand( "text", { search: " " } ); +assert.eq( 0, res.results.length ); + +// -------------------------------------------- FILTERING ------------------------------------------ + +assert.eq( [2], queryIDS( tc, "Mahim" ) ); +assert.eq( [2], queryIDS( tc, "Mahim", { _id: 2 } ) ); +assert.eq( [], queryIDS( tc, "Mahim", { _id: 1 } ) ); +assert.eq( [], queryIDS( tc, "Mahim", { _id: { $gte: 4 } } ) ); +assert.eq( [2], queryIDS( tc, "Mahim", { _id: { $lte: 4 } } ) ); + +// using regex conditional filtering +assert.eq( [9], queryIDS( tc, "members", { title: { $regex: /Phy.*/i } } ) ); + +// ------------------------------------------------------------------------------------------------- + +assert( tc.validate().valid ); diff --git a/jstests/fts_partition1.js b/jstests/fts_partition1.js new file mode 100644 index 00000000000..fb28534c443 --- /dev/null +++ b/jstests/fts_partition1.js @@ -0,0 +1,18 @@ +load( "jstests/libs/fts.js" ) + +t = db.text_parition1; +t.drop(); + +t.insert( { _id : 1 , x : 1 , y : "foo" } ); +t.insert( { _id : 2 , x : 1 , y : "bar" } ); +t.insert( { _id : 3 , x : 2 , y : "foo" } ); +t.insert( { _id : 4 , x : 2 , y : "bar" } ); + +t.ensureIndex( { x : 1, y : "text" } ); + +res = 
t.runCommand( "text", { search : "foo" } ); +assert.eq( 0, res.ok, tojson(res) ); + +assert.eq( [ 1 ], queryIDS( t, "foo" , { x : 1 } ) ); + + diff --git a/jstests/fts_phrase.js b/jstests/fts_phrase.js new file mode 100644 index 00000000000..0b58bef817e --- /dev/null +++ b/jstests/fts_phrase.js @@ -0,0 +1,25 @@ + +t = db.text_phrase; +t.drop() + +t.save( { _id : 1 , title : "my blog post" , text : "i am writing a blog. yay" } ); +t.save( { _id : 2 , title : "my 2nd post" , text : "this is a new blog i am typing. yay" } ); +t.save( { _id : 3 , title : "knives are Fun" , text : "this is a new blog i am writing. yay" } ); + +t.ensureIndex( { "title" : "text" , text : "text" } , { weights : { title : 10 } } ); + +res = t.runCommand( "text" , { search : "blog write" } ); +assert.eq( 3, res.results.length ); +assert.eq( 1, res.results[0].obj._id ); +assert( res.results[0].score > (res.results[1].score*2), tojson(res) ); + +res = t.runCommand( "text" , { search : "write blog" } ); +assert.eq( 3, res.results.length ); +assert.eq( 1, res.results[0].obj._id ); +assert( res.results[0].score > (res.results[1].score*2), tojson(res) ); + + + + + + diff --git a/jstests/fts_proj.js b/jstests/fts_proj.js new file mode 100644 index 00000000000..9ceec8016be --- /dev/null +++ b/jstests/fts_proj.js @@ -0,0 +1,22 @@ +load( "jstests/libs/fts.js" ); + +t = db.text_proj; +t.drop(); + +t.save( { _id : 1 , x : "a", y: "b", z : "c"}); +t.save( { _id : 2 , x : "d", y: "e", z : "f"}); +t.save( { _id : 3 , x : "a", y: "g", z : "h"}); + +t.ensureIndex( { x : "text"} , { default_language : "none" } ); + +res = t.runCommand("text", {search : "a"}); +assert.eq( 2, res.results.length ); +assert( res.results[0].obj.y, tojson(res) ); + +res = t.runCommand("text", {search : "a", projection: {x: 1}}); +assert.eq( 2, res.results.length ); +assert( !res.results[0].obj.y, tojson(res) ); + + + + diff --git a/jstests/fts_spanish.js b/jstests/fts_spanish.js new file mode 100644 index 00000000000..136eaf17ae1 
--- /dev/null
+++ b/jstests/fts_spanish.js
@@ -0,0 +1,32 @@
+
+load( "jstests/libs/fts.js" );
+
+t = db.text_spanish;
+t.drop();
+
+t.save( { _id: 1, title: "mi blog", text: "Este es un blog de prueba" } );
+t.save( { _id: 2, title: "mi segundo post", text: "Este es un blog de prueba" } );
+t.save( { _id: 3, title: "cuchillos son divertidos", text: "este es mi tercer blog stemmed" } );
+t.save( { _id: 4, language: "english", title: "My fourth blog", text: "This stemmed blog is in english" } );
+
+// default weight is 1
+// specify weights if you want a field to be more meaningful
+t.ensureIndex( { "title": "text", text: "text" }, { weights: { title: 10 },
+                                                 default_language: "spanish" } );
+
+res = t.runCommand( "text", { search: "blog" } );
+assert.eq( 4, res.results.length );
+
+assert.eq( [4], queryIDS( t, "stem" ) );
+assert.eq( [3], queryIDS( t, "stemmed" ) );
+assert.eq( [4], queryIDS( t, "stemmed", null, { language : "english" } ) );
+
+assert.eq( [1,2], queryIDS( t, "prueba" ) );
+
+
+
+
+
+
+
+
diff --git a/jstests/libs/fts.js b/jstests/libs/fts.js
new file mode 100644
index 00000000000..1dd181a243d
--- /dev/null
+++ b/jstests/libs/fts.js
@@ -0,0 +1,29 @@
+
+// make sure we're enabled
+db.adminCommand( { setParameter : "*", textSearchEnabled : true } );
+
+// Runs the "text" command for `search` against `coll` and returns the _id of every
+// result, in the order the results array lists them.
+//   filter - optional object sent as the command's "filter" field
+//   extra  - optional object whose fields are merged into the command (e.g. { limit : 1 })
+// The raw reply is published in the global `lastCommadResult`; the spelling is kept
+// because the fts2-fts5 tests read `lastCommadResult.stats` by that name.
+function queryIDS( coll, search, filter, extra ){
+    var cmd = { search : search };
+    if ( filter )
+        cmd.filter = filter;
+    if ( extra )
+        Object.extend( cmd, extra );
+    lastCommadResult = coll.runCommand( "text" , cmd);
+
+    return getIDS( lastCommadResult );
+}
+
+// Maps a "text" command reply to the array of matched _ids; a missing reply or a
+// reply without a `results` field yields [].
+function getIDS( commandResult ){
+    if ( ! ( commandResult && commandResult.results ) )
+        return [];
+
+    return commandResult.results.map( function(z){ return z.obj._id; } );
+}
diff --git a/src/mongo/db/fts/SConscript b/src/mongo/db/fts/SConscript
new file mode 100644
index 00000000000..efc1f10586f
--- /dev/null
+++ b/src/mongo/db/fts/SConscript
@@ -0,0 +1,93 @@
+# -*- mode: python -*-
+
+Import("env")
+
+# Languages that have a stop_words_<language>.txt input file.
+stop_word_languages = [
+    'danish',
+    'dutch',
+    'english',
+    'finnish',
+    'french',
+    'german',
+    'hungarian',
+    'italian',
+    'norwegian',
+    'portuguese',
+    'romanian',
+    'russian',
+    'spanish',
+    'swedish',
+    'turkish',
+]
+
+# Run generate_stop_words.py over the per-language lists to produce stop_words_list.{h,cpp}.
+env.Command( [ "stop_words_list.h", "stop_words_list.cpp"],
+             [ "generate_stop_words.py"] + [ 'stop_words_%s.txt' % x for x in stop_word_languages ],
+             "$PYTHON $SOURCES $TARGETS" )
+
+# this is not awesome
+# stopwords is built from a cloned environment so -O3 can be dropped for it alone.
+hack = env.Clone()
+# list.remove() returns None, so assigning its result (as this code used to) cleared
+# CCFLAGS altogether; filter the flag out instead, before declaring the library.
+if "-O3" in hack["CCFLAGS"]:
+    hack["CCFLAGS"] = [ flag for flag in hack["CCFLAGS"] if flag != "-O3" ]
+hack.StaticLibrary( "stopwords", [ "stop_words_list.cpp" ] )
+
+env.StaticLibrary('base', [
+        'fts_index_format.cpp',
+        'fts_matcher.cpp',
+        'fts_query.cpp',
+        'fts_spec.cpp',
+        'fts_util.cpp',
+        'stemmer.cpp',
+        'stop_words.cpp',
+        'tokenizer.cpp',
+        ], LIBDEPS=["stopwords",
+                    "$BUILD_DIR/mongo/base/base",
+                    "$BUILD_DIR/mongo/bson",
+                    "$BUILD_DIR/mongo/platform/platform",
+                    "$BUILD_DIR/third_party/libstemmer_c/stemmer"
+                    ])
+
+env.StaticLibrary( 'server_common', [
+        'fts_command.cpp',
+        'fts_enabled.cpp'
+        ] )
+
+env.StaticLibrary('ftsmongod', [
+        'fts_command_mongod.cpp',
+        'fts_index.cpp',
+        'fts_search.cpp',
+        ], LIBDEPS=["base","server_common"])
+
+
+env.StaticLibrary('ftsmongos', [
+        'fts_command_mongos.cpp',
+        ], LIBDEPS=["server_common"])
+
+
+env.CppUnitTest( "fts_index_format_test", "fts_index_format_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stop_words_test", "stop_words_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_stemmer_test", "stemmer_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_tokenizer_test", "tokenizer_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_query_test", "fts_query_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_spec_test", "fts_spec_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",
+                 LIBDEPS=["base"] )
+
+env.CppUnitTest( "fts_util_test", "fts_util_test.cpp",
+                 LIBDEPS=["base","$BUILD_DIR/mongo/mongohasher"] )
diff --git a/src/mongo/db/fts/fts_command.cpp b/src/mongo/db/fts/fts_command.cpp
new file mode 100644
index 00000000000..0cfdf29f8c6
--- /dev/null
+++ b/src/mongo/db/fts/fts_command.cpp
@@ -0,0 +1,93 @@
+// fts_command.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        using namespace mongoutils;
+
+        FTSCommand ftsCommand; // single global instance of the command
+
+        FTSCommand::FTSCommand()
+            : Command( "text" ) {
+        }
+
+        void FTSCommand::addRequiredPrivileges(const std::string& dbname,
+                                               const BSONObj& cmdObj,
+                                               std::vector<Privilege>* out) {
+            ActionSet actions;
+            actions.addAction(ActionType::find); // "find" is the only action requested
+            out->push_back(Privilege(parseNs(dbname, cmdObj), actions));
+        }
+
+        // Validates the request, unpacks its options from cmdObj and delegates to _run().
+        bool FTSCommand::run(const string& dbname,
+                             BSONObj& cmdObj,
+                             int options,
+                             string& errmsg,
+                             BSONObjBuilder& result,
+                             bool fromRepl) {
+
+            if ( !isTextSearchEnabled() ) {
+                errmsg = "text search not enabled";
+                return false;
+            }
+
+            string ns = dbname + "." + cmdObj.firstElement().String(); // "<db>.<value of first field>"
+
+            string search = cmdObj["search"].valuestrsafe();
+            if ( search.size() == 0 ) { // an empty search string is rejected
+                errmsg = "no search specified";
+                return false;
+            }
+
+            string language = cmdObj["language"].valuestrsafe(); // "" means not set (see _run)
+
+            int limit = cmdObj["limit"].numberInt();
+            if (limit == 0) // zero is replaced by the default of 100
+                limit = 100;
+
+            BSONObj filter; // stays empty unless "filter" is an object
+            if ( cmdObj["filter"].isABSONObj() )
+                filter = cmdObj["filter"].Obj();
+
+            BSONObj projection; // stays empty unless "projection" is an object
+            if (cmdObj["projection"].isABSONObj()) {
+                projection = cmdObj["projection"].Obj();
+            }
+
+            return _run( dbname, cmdObj, options,
+                         ns, search, language, limit, filter, projection, errmsg, result );
+        }
+
+
+    }
+
+
+}
diff --git a/src/mongo/db/fts/fts_command.h b/src/mongo/db/fts/fts_command.h
new file mode 100644
index 00000000000..cbd92758ecb
--- /dev/null
+++ b/src/mongo/db/fts/fts_command.h
@@ -0,0 +1,68 @@
+// fts_command.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "mongo/db/commands.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        class FTSCommand : public Command { // the "text" command (name set in fts_command.cpp)
+        public:
+            FTSCommand();
+
+            bool slaveOk() const { return true; }
+            bool slaveOverrideOk() const { return true; }
+
+            LockType locktype() const; // READ in fts_command_mongod.cpp, NONE in fts_command_mongos.cpp
+
+            void addRequiredPrivileges(const std::string& dbname,
+                                       const BSONObj& cmdObj,
+                                       std::vector<Privilege>* out);
+
+            // Shared entry point (fts_command.cpp): parses cmdObj, then calls _run().
+            bool run(const string& dbname,
+                     BSONObj& cmdObj,
+                     int options,
+                     string& errmsg,
+                     BSONObjBuilder& result,
+                     bool fromRepl);
+
+        protected:
+            bool _run( const string& dbName, // implemented separately for mongod and mongos
+                       BSONObj& cmdObj,
+                       int cmdOptions,
+                       const string& ns,
+                       const string& searchString,
+                       string language, // "" for not-set
+                       int limit,
+                       BSONObj& filter,
+                       BSONObj& projection,
+                       string& errmsg,
+                       BSONObjBuilder& result );
+        };
+
+    }
+
+}
+
diff --git a/src/mongo/db/fts/fts_command_mongod.cpp b/src/mongo/db/fts/fts_command_mongod.cpp
new file mode 100644
index 00000000000..cd38175c8e5
--- /dev/null
+++ b/src/mongo/db/fts/fts_command_mongod.cpp
@@ -0,0 +1,159 @@
+// fts_command_mongod.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "mongo/db/fts/fts_command.h"
+#include "mongo/db/fts/fts_search.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/pdfile.h"
+#include "mongo/db/projection.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        Command::LockType FTSCommand::locktype() const {
+            return READ;
+        }
+
+        /*
+         * Runs a text search on ns and appends "results" and "stats" to result.
+         * @param dbname, cmdObj, cmdOptions: part of the shared signature, not read here
+         * @param ns, namespace to search; it must have exactly one text index
+         * @param searchString, language: given to FTSQuery::parse; "" = index default language
+         * @param limit, filter: given to FTSSearch; filter also yields the index prefix
+         * @param projection, applied to every returned document when non-empty
+         * @param errmsg, result: receive the failure reason / the reply fields
+         * @return true if successful, false otherwise
+         */
+        bool FTSCommand::_run(const string& dbname,
+                              BSONObj& cmdObj,
+                              int cmdOptions,
+                              const string& ns,
+                              const string& searchString,
+                              string language, // "" for not-set
+                              int limit,
+                              BSONObj& filter,
+                              BSONObj& projection,
+                              string& errmsg,
+                              BSONObjBuilder& result ) {
+
+            Timer comm;
+
+            scoped_ptr<Projection> pr;
+            if ( !projection.isEmpty() ) {
+                pr.reset( new Projection() );
+                pr->init( projection );
+            }
+
+            // priority queue for results
+            Results results;
+
+            NamespaceDetails * d = nsdetails( ns.c_str() );
+            if ( !d ) {
+                errmsg = "can't find ns";
+                return false;
+            }
+
+            vector<int> idxMatches;
+            d->findIndexByType( INDEX_NAME, idxMatches );
+            if ( idxMatches.size() == 0 ) {
+                errmsg = str::stream() << "no text index for: " << ns;
+                return false;
+            }
+            if ( idxMatches.size() > 1 ) {
+                errmsg = str::stream() << "too many text index for: " << ns;
+                return false;
+            }
+
+            const IndexDetails& id = d->idx( idxMatches[0] );
+            BSONObj indexPrefix;
+
+            FTSIndex* ftsIndex = static_cast<FTSIndex*>(id.getSpec().getType());
+            if ( language == "" ) {
+                language = ftsIndex->getFtsSpec().defaultLanguage();
+            }
+            // the prefix comes from the filter, not the language: always compute it
+            Status s = ftsIndex->getFtsSpec().getIndexPrefix( filter, &indexPrefix );
+            if ( !s.isOK() ) {
+                errmsg = s.toString();
+                return false;
+            }
+
+            FTSQuery query;
+            if ( !query.parse( searchString, language ).isOK() ) {
+                errmsg = "can't parse search";
+                return false;
+            }
+            result.append( "queryDebugString", query.debugString() );
+            result.append( "language", language );
+
+            FTSSearch search( d, id, indexPrefix, query, filter );
+            search.go( &results, limit );
+
+            // grab underlying container inside priority queue
+            vector<ScoredLocation> r( results.dangerous() );
+
+            // sort results by score (not always in correct order, especially w.r.t. multiterm)
+            sort( r.begin(), r.end() );
+
+            // build the results bson array shown to user
+            BSONArrayBuilder a( result.subarrayStart( "results" ) );
+
+            int BSONResultSize = 1024;
+
+            for ( unsigned n = 0; n < r.size(); n++ ) {
+                BSONObj obj = BSONObj::make(r[n].rec);
+                BSONObj toSendBack = obj;
+
+                if ( pr ) {
+                    toSendBack = pr->transform(obj);
+                }
+
+                if ( ( BSONResultSize + toSendBack.objsize() ) >= BSONObjMaxUserSize ) {
+                    break;
+                }
+
+                BSONObjBuilder x( a.subobjStart() );
+                x.append( "score" , r[n].score );
+                x.append( "obj", toSendBack );
+
+                BSONObj xobj = x.done();
+                BSONResultSize += xobj.objsize();
+            }
+
+            a.done();
+
+            // returns some stats to the user
+            BSONObjBuilder bb( result.subobjStart( "stats" ) );
+            bb.appendNumber( "nscanned" , search.getKeysLookedAt() );
+            bb.appendNumber( "nscannedObjects" , search.getObjLookedAt() );
+            bb.appendNumber( "n" , r.size() );
+            bb.append( "timeMicros", (int)comm.micros() );
+            bb.done();
+
+            return true;
+        }
+    }
+
+}
diff --git 
a/src/mongo/db/fts/fts_command_mongos.cpp b/src/mongo/db/fts/fts_command_mongos.cpp new file mode 100644 index 00000000000..04cc8a1b808 --- /dev/null +++ b/src/mongo/db/fts/fts_command_mongos.cpp @@ -0,0 +1,129 @@ +// fts_command_mongos.cpp + +/** + * Copyright (C) 2008 10gen Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <map> +#include <string> +#include <vector> + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_command.h" +#include "mongo/s/strategy.h" + + +namespace mongo { + namespace fts { + + struct Scored { + Scored( BSONObj full ) + : full( full ) { + score = full["score"].numberDouble(); + } + bool operator<( const Scored& other ) const { + return other.score < score; + } + BSONObj full; + double score; + }; + + + // all grid commands are designed not to lock + Command::LockType FTSCommand::locktype() const { return NONE; } + + bool FTSCommand::_run(const string& dbName, + BSONObj& cmdObj, + int cmdOptions, + const string& ns, + const string& searchString, + string language, // "" for not-set + int limit, + BSONObj& filter, + BSONObj& projection, + string& errmsg, + BSONObjBuilder& result ) { + + Timer timer; + + map<Shard, BSONObj> results; + SHARDED->commandOp( dbName, cmdObj, cmdOptions, ns, filter, results ); + + vector<Scored> all; + long long nscanned = 0; + long long nscannedObjects = 0; + + BSONObjBuilder shardStats; + + for ( map<Shard,BSONObj>::const_iterator 
i = results.begin(); i != results.end(); ++i ) { + BSONObj r = i->second; + + LOG(2) << "fts result for shard: " << i->first << "\n" << r << endl; + + if ( !r["ok"].trueValue() ) { + errmsg = str::stream() << "failure on shard: " << i->first.toString() + << ": " << r["errmsg"]; + result.append( "rawresult", r ); + return false; + } + + if ( r["stats"].isABSONObj() ) { + BSONObj x = r["stats"].Obj(); + nscanned += x["nscanned"].numberLong(); + nscannedObjects += x["nscannedObjects"].numberLong(); + + shardStats.append( i->first.getName(), x ); + } + + if ( r["results"].isABSONObj() ) { + BSONObjIterator j( r["results"].Obj() ); + while ( j.more() ) { + BSONElement e = j.next(); + all.push_back( Scored(e.Obj()) ); + } + } + } + + sort( all.begin(), all.end() ); + long long n = 0; + { + BSONArrayBuilder arr( result.subarrayStart( "results" ) ); + for ( unsigned i = 0; i < all.size(); i++ ) { + arr.append( all[i].full ); + if ( ++n >= limit ) + break; + } + arr.done(); + } + + { + BSONObjBuilder stats( result.subobjStart( "stats" ) ); + stats.appendNumber( "nscanned", nscanned ); + stats.appendNumber( "nscannedObjects", nscannedObjects ); + stats.appendNumber( "n", n ); + stats.append( "timeMicros", (int)timer.micros() ); + + stats.append( "shards", shardStats.obj() ); + + stats.done(); + } + + return true; + } + + FTSCommand ftsCommandSharded; + } +} diff --git a/src/mongo/db/fts/fts_enabled.cpp b/src/mongo/db/fts/fts_enabled.cpp new file mode 100644 index 00000000000..7a11e394f6a --- /dev/null +++ b/src/mongo/db/fts/fts_enabled.cpp @@ -0,0 +1,28 @@ +// fts_enabled.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/server_parameters.h"
+
+namespace mongo {
+    namespace fts {
+        // Boolean server parameter "textSearchEnabled"; the macro arguments are
+        // name, type and (presumably) the initial value false - confirm against
+        // server_parameters.h.
+        MONGO_EXPORT_SERVER_PARAMETER( textSearchEnabled, bool, false );
+
+        /**
+         * @return the current value of the textSearchEnabled server parameter
+         */
+        bool isTextSearchEnabled() {
+            return textSearchEnabled;
+        }
+    }
+}
diff --git a/src/mongo/db/fts/fts_enabled.h b/src/mongo/db/fts/fts_enabled.h
new file mode 100644
index 00000000000..d3f733dc49f
--- /dev/null
+++ b/src/mongo/db/fts/fts_enabled.h
@@ -0,0 +1,25 @@
+// fts_enabled.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+namespace mongo {
+    namespace fts {
+        /**
+         * @return true if the textSearchEnabled server parameter is set
+         */
+        bool isTextSearchEnabled();
+    }
+}
diff --git a/src/mongo/db/fts/fts_index.cpp b/src/mongo/db/fts/fts_index.cpp
new file mode 100644
index 00000000000..04fafe12a83
--- /dev/null
+++ b/src/mongo/db/fts/fts_index.cpp
@@ -0,0 +1,96 @@
+// fts_index.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/base/init.h"
+#include "mongo/db/client.h"
+#include "mongo/db/fts/fts_enabled.h"
+#include "mongo/db/fts/fts_index.h"
+#include "mongo/db/fts/fts_index_format.h"
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/stringutils.h"
+#include "mongo/util/timer.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        using namespace mongoutils;
+
+        /*
+         * Builds the index type and constructs its FTSSpec from spec->info.
+         * @param plugin the index plugin for FTS
+         * @param spec the index specification
+         */
+        FTSIndex::FTSIndex( const IndexPlugin* plugin, const IndexSpec* spec )
+            : IndexType( plugin, spec ), _ftsSpec( spec->info ) {
+        }
+
+        /*
+         * Generates the index keys for obj by delegating to FTSIndexFormat::getKeys
+         * with this index's FTSSpec.
+         * @param obj, the document being indexed
+         * @param keys, set that receives the generated keys
+         */
+        void FTSIndex::getKeys( const BSONObj& obj, BSONObjSet& keys) const {
+            FTSIndexFormat::getKeys( _ftsSpec, obj, &keys );
+        }
+
+        /*
+         * Not supported for text indexes: always trips verify(0). The return
+         * statement only exists to satisfy the signature.
+         */
+        shared_ptr<Cursor> FTSIndex::newCursor( const BSONObj& query,
+                                                const BSONObj& order,
+                                                int numWanted ) const {
+            shared_ptr<Cursor> c;
+            verify(0);
+            return c;
+        }
+
+
+        // The plugin is constructed under the name INDEX_NAME.
+        FTSIndexPlugin::FTSIndexPlugin() : IndexPlugin( INDEX_NAME ) {}
+
+
+        /*
+         * Adjusts spec by appending information relative to the
+         * FTS Index (such as weights, index name, etc); the actual work is done
+         * by FTSSpec::fixSpec.
+         * Raises massert 16633 when called from a client whose description starts
+         * with "conn" while text search is not enabled.
+         * @param spec, specification object
+         * @return the adjusted specification
+         */
+        BSONObj FTSIndexPlugin::adjustIndexSpec( const BSONObj& spec ) const {
+            StringData desc = cc().desc();
+            if ( desc.find( "conn" ) == 0 ) {
+                // this is to make sure we only complain for users:
+                // if a text index does get created on a primary, we
+                // want it to be indexed on the secondary as well
+                massert( 16633, "text search not enabled", isTextSearchEnabled() );
+            }
+            return FTSSpec::fixSpec( spec );
+        }
+
+        /*
+         * Generates an FTSIndex with a spec and this plugin
+         * @param spec, specification to be used
+         * @return a newly allocated FTSIndex (raw pointer; ownership passes to the caller)
+         */
+        IndexType* FTSIndexPlugin::generate( const IndexSpec* spec ) const {
+            return new FTSIndex( this, spec );
+        }
+
+
+        // Single plugin instance, allocated by the initializer below and never
+        // deleted in this file.
+        FTSIndexPlugin* ftsPlugin;
+        MONGO_INITIALIZER(FTSIndexPlugin)(InitializerContext* context) {
+            ftsPlugin = new FTSIndexPlugin();
+            return Status::OK();
+        }
+
+    }
+
+}
diff --git a/src/mongo/db/fts/fts_index.h b/src/mongo/db/fts/fts_index.h
new file mode 100644
index 00000000000..d9bf8a61b16
--- /dev/null
+++ b/src/mongo/db/fts/fts_index.h
@@ -0,0 +1,67 @@
+// fts_index.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <map>
+#include <vector>
+
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/fts_util.h"
+#include "mongo/db/fts/stemmer.h"
+#include "mongo/db/fts/stop_words.h"
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/db/index.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /**
+         * Index type for text indexes. Holds the FTSSpec built from the index
+         * specification and uses it to generate index keys.
+         */
+        class FTSIndex : public IndexType {
+        public:
+
+            // index constructor, called when user enters ensureIndex command with fts flag
+            FTSIndex(const IndexPlugin *plugin, const IndexSpec* spec);
+
+            /** Adds the index keys generated for obj to keys. */
+            void getKeys( const BSONObj& obj, BSONObjSet& keys) const;
+
+            /* newCursor is pure virtual in IndexType so it has to be redefined in FTSIndex */
+            shared_ptr<Cursor> newCursor( const BSONObj& query,
+                                          const BSONObj& order,
+                                          int numWanted ) const;
+
+            /** @return the text-specific view of the index specification */
+            const FTSSpec& getFtsSpec() const { return _ftsSpec; }
+
+        private:
+
+            FTSSpec _ftsSpec;
+        };
+
+
+        /**
+         * Index plugin that creates FTSIndex instances and adjusts index
+         * specifications for text indexes.
+         */
+        class FTSIndexPlugin : public IndexPlugin {
+        public:
+            FTSIndexPlugin();
+
+            /** @return a newly allocated index type for spec */
+            IndexType* generate( const IndexSpec* spec ) const;
+
+            /** @return spec with the text-index specific adjustments applied */
+            BSONObj adjustIndexSpec( const BSONObj& spec ) const;
+
+        };
+
+    } //namespace fts
+} //namespace mongo
diff --git a/src/mongo/db/fts/fts_index_format.cpp b/src/mongo/db/fts/fts_index_format.cpp
new file mode 100644
index 00000000000..b39b336d651
--- /dev/null
+++ b/src/mongo/db/fts/fts_index_format.cpp
@@ -0,0 +1,119 @@
+// fts_index_format.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#include "mongo/pch.h" + +#include "mongo/base/init.h" +#include "mongo/db/fts/fts_index_format.h" + +namespace mongo { + + namespace fts { + + namespace { + BSONObj nullObj; + BSONElement nullElt; + } + + MONGO_INITIALIZER( FTSIndexFormat )( InitializerContext* context ) { + BSONObjBuilder b; + b.appendNull( "" ); + nullObj = b.obj(); + nullElt = nullObj.firstElement(); + return Status::OK(); + } + + void FTSIndexFormat::getKeys( const FTSSpec& spec, + const BSONObj& obj, + BSONObjSet* keys ) { + + int extraSize = 0; + vector<BSONElement> extrasBefore; + vector<BSONElement> extrasAfter; + + // compute the non FTS key elements + for ( unsigned i = 0; i < spec.numExtraBefore(); i++ ) { + BSONElement e = obj.getFieldDotted(spec.extraBefore(i)); + if ( e.eoo() ) + e = nullElt; + extrasBefore.push_back(e); + extraSize += e.size(); + } + for ( unsigned i = 0; i < spec.numExtraAfter(); i++ ) { + BSONElement e = obj.getFieldDotted(spec.extraAfter(i)); + if ( e.eoo() ) + e = nullElt; + extrasAfter.push_back(e); + extraSize += e.size(); + } + + + TermFrequencyMap term_freqs; + spec.scoreDocument( obj, &term_freqs ); + + // create index keys from raw scores + // only 1 per string + for ( TermFrequencyMap::const_iterator i = term_freqs.begin(); + i != term_freqs.end(); + ++i ) { + + const string& term = i->first; + double weight = i->second; + + // guess the total size of the btree entry based on the size of the weight, term tuple + int guess = + 5 /* bson overhead */ + + 10 /* weight */ + + 8 /* term overhead */ + + term.size() + + extraSize; + + BSONObjBuilder b(guess); // builds a BSON object with guess length. 
+ for ( unsigned k = 0; k < extrasBefore.size(); k++ ) + b.appendAs( extrasBefore[k], "" ); + _appendIndexKey( b, weight, term ); + for ( unsigned k = 0; k < extrasAfter.size(); k++ ) + b.appendAs( extrasAfter[k], "" ); + BSONObj res = b.obj(); + + verify( guess >= res.objsize() ); + + keys->insert( res ); + } + } + + BSONObj FTSIndexFormat::getIndexKey( double weight, + const string& term, + const BSONObj& indexPrefix ) { + BSONObjBuilder b; + + BSONObjIterator i( indexPrefix ); + while ( i.more() ) + b.appendAs( i.next(), "" ); + + _appendIndexKey( b, weight, term ); + return b.obj(); + } + + void FTSIndexFormat::_appendIndexKey( BSONObjBuilder& b, double weight, const string& term ) { + verify( weight >= 0 && weight <= MAX_WEIGHT ); // FTSmaxweight = defined in fts_header + b.append( "", term ); + b.append( "", weight ); + } + } +} diff --git a/src/mongo/db/fts/fts_index_format.h b/src/mongo/db/fts/fts_index_format.h new file mode 100644 index 00000000000..eeb225e756f --- /dev/null +++ b/src/mongo/db/fts/fts_index_format.h @@ -0,0 +1,55 @@ +// fts_index_format.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_spec.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /**
+         * Static helpers that define the layout of text index keys.
+         */
+        class FTSIndexFormat {
+        public:
+
+            /*
+             * Generates the index keys for a document.
+             * @param spec, the text index specification
+             * @param document, the document to generate keys for
+             * @param keys, set the generated keys are inserted into
+             */
+            static void getKeys( const FTSSpec& spec,
+                                 const BSONObj& document,
+                                 BSONObjSet* keys );
+
+            /*
+             * Helper method to get return entry from the FTSIndex as a BSONObj
+             * @param weight, the weight of the term in the entry
+             * @param term, the string term in the entry
+             * @param indexPrefix, the fields that go in the index first
+             */
+            static BSONObj getIndexKey( double weight,
+                                        const string& term,
+                                        const BSONObj& indexPrefix );
+
+        private:
+            /*
+             * Helper method that appends the term/weight pair of an index entry
+             * to an existing builder (nothing is returned)
+             * @param b, reference to the BSONObjBuilder
+             * @param weight, the weight of the term in the entry
+             * @param term, the string term in the entry
+             */
+            static void _appendIndexKey( BSONObjBuilder& b, double weight, const string& term );
+        };
+
+    }
+}
diff --git a/src/mongo/db/fts/fts_index_format_test.cpp b/src/mongo/db/fts/fts_index_format_test.cpp
new file mode 100644
index 00000000000..7b0f5b32f0a
--- /dev/null
+++ b/src/mongo/db/fts/fts_index_format_test.cpp
@@ -0,0 +1,96 @@
+// fts_index_format_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_index_format.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + + namespace fts { + + TEST( FTSIndexFormat, Simple1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat sat" ), &keys ); + + ASSERT_EQUALS( 2U, keys.size() ); + for ( BSONObjSet::const_iterator i = keys.begin(); i != keys.end(); ++i ) { + BSONObj key = *i; + ASSERT_EQUALS( 2, key.nFields() ); + ASSERT_EQUALS( String, key.firstElement().type() ); + } + } + + TEST( FTSIndexFormat, ExtraBack1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + } + + /* + TEST( FTSIndexFormat, ExtraBackArray1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "data" << "text" << + "x.y" << 1 ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, + BSON( "data" << "cat" << + "x" << BSON_ARRAY( BSON( "y" << 1 ) << + BSON( "y" << 2 ) ) ), + &keys ); + + ASSERT_EQUALS( 1U, keys.size() ); + BSONObj key = *(keys.begin()); + log() << "e: " << key << endl; + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + } + */ + + TEST( FTSIndexFormat, ExtraFront1 ) { + FTSSpec spec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << 1 << + "data" << "text" ) ) ) ); + BSONObjSet keys; + FTSIndexFormat::getKeys( spec, BSON( "data" << "cat" << "x" << 5 ), &keys ); + + ASSERT_EQUALS( 
1U, keys.size() ); + BSONObj key = *(keys.begin()); + ASSERT_EQUALS( 3, key.nFields() ); + BSONObjIterator i( key ); + ASSERT_EQUALS( 5, i.next().numberInt() ); + ASSERT_EQUALS( StringData("cat"), i.next().valuestr() ); + ASSERT( i.next().numberDouble() > 0 ); + } + + + } +} diff --git a/src/mongo/db/fts/fts_matcher.cpp b/src/mongo/db/fts/fts_matcher.cpp new file mode 100644 index 00000000000..313fdd5be9e --- /dev/null +++ b/src/mongo/db/fts/fts_matcher.cpp @@ -0,0 +1,247 @@ +// fts_matcher.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_matcher.h" + +namespace mongo { + + namespace fts { + + + FTSMatcher::FTSMatcher( const FTSQuery& query, const FTSSpec& spec ) + : _query( query ), + _spec( spec ), + _stemmer( query.getLanguage() ){ + } + + /* + * Checks if the obj contains any of the negTerms, if so returns true, otherwise false + * @param obj, object to be checked + */ + bool FTSMatcher::hasNegativeTerm(const BSONObj& obj ) const { + // called during search. deals with the case in which we have a term + // flagged for exclusion, i.e. 
"hello -world" we want to remove all + // results that include "world" + + if ( _query.getNegatedTerms().size() == 0 ) + return false; + + if ( _spec.wildcard() ) { + return _hasNegativeTerm_recurse(obj); + } + + /* otherwise look at fields where weights are defined */ + for ( Weights::const_iterator i = _spec.weights().begin(); + i != _spec.weights().end(); + i++ ) { + const char * leftOverName = i->first.c_str(); + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + + if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + if ( x.type() == String ) + if ( _hasNegativeTerm_string( x.String() ) ) + return true; + } + } + else if ( e.type() == String ) { + if ( _hasNegativeTerm_string( e.String() ) ) + return true; + } + } + return false; + } + + bool FTSMatcher::_hasNegativeTerm_recurse(const BSONObj& obj ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( _spec.languageOverrideField() == x.fieldName()) + continue; + + if (x.type() == String) { + if ( _hasNegativeTerm_string( x.String() ) ) + return true; + } + else if ( x.isABSONObj() ) { + BSONObjIterator k( x.Obj() ); + while ( k.more() ) { + // check if k.next() is a obj/array or not + BSONElement y = k.next(); + if ( y.type() == String ) { + if ( _hasNegativeTerm_string( y.String() ) ) + return true; + } + else if ( y.isABSONObj() ) { + if ( _hasNegativeTerm_recurse( y.Obj() ) ) + return true; + } + } + } + } + return false; + } + + /* + * Checks if any of the negTerms is in the tokenized string + * @param raw, the raw string to be tokenized + */ + bool FTSMatcher::_hasNegativeTerm_string( const string& raw ) const { + + Tokenizer i( _query.getLanguage(), raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) + continue; + string word = tolowerString( _stemmer.stem( t.data ) ); + if ( 
_query.getNegatedTerms().count( word ) > 0 ) + return true; + } + return false; + } + + + bool FTSMatcher::phrasesMatch( const BSONObj& obj ) const { + for (unsigned i = 0; i < _query.getPhr().size(); i++ ) { + if ( !phraseMatch( _query.getPhr()[i], obj ) ) { + return false; + } + } + + for (unsigned i = 0; i < _query.getNegatedPhr().size(); i++ ) { + if ( phraseMatch( _query.getNegatedPhr()[i], obj ) ) { + return false; + } + } + + return true; + } + + + /** + * Checks if phrase is exactly matched in obj, returns true if so, false otherwise + * @param phrase, the string to be matched + * @param obj, document in the collection to match against + */ + bool FTSMatcher::phraseMatch( const string& phrase, const BSONObj& obj ) const { + + if ( _spec.wildcard() ) { + // case where everything is indexed (all fields) + return _phraseRecurse( phrase, obj ); + } + + for ( Weights::const_iterator i = _spec.weights().begin(); + i != _spec.weights().end(); + ++i ) { + + // figure out what the indexed field is.. ie. is it "field" or "field.subfield" etc. 
+ const char * leftOverName = i->first.c_str(); + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + + if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + + if ( x.type() == String ) + if ( _phraseMatches( phrase, x.String() ) ) + return true; + } + } + else if ( e.type() == String ) { + if ( _phraseMatches( phrase, e.String() ) ) + return true; + } + } + return false; + } + + + /* + * Recurses over all fields in the obj to match against phrase + * @param phrase, string to be matched + * @param obj, object to matched against + */ + bool FTSMatcher::_phraseRecurse( const string& phrase, const BSONObj& obj ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( _spec.languageOverrideField() == x.fieldName() ) + continue; + + if ( x.type() == String ) { + if ( _phraseMatches( phrase, x.String() ) ) + return true; + } + else if ( x.isABSONObj() ) { + BSONObjIterator k( x.Obj() ); + + while ( k.more() ) { + + BSONElement y = k.next(); + + if ( y.type() == mongo::String ) { + if ( _phraseMatches( phrase, y.String() ) ) + return true; + } + else if ( y.isABSONObj() ) { + if ( _phraseRecurse( phrase, y.Obj() ) ) + return true; + } + } + + } + } + + return false; + } + + + /* + * Looks for phrase in a raw string + * @param phrase, phrase to match + * @param raw, raw string to be parsed + */ + bool FTSMatcher::_phraseMatches( const string& phrase, const string& haystack ) const { +#ifdef _WIN32 + // windows doesn't have strcasestr + // for now, doing something very slow, bu correct + string p = phrase; + string h = haystack; + makeLower( &p ); + makeLower( &h ); + return strstr( h.c_str(), p.c_str() ) > 0; +#else + return strcasestr( haystack.c_str(), phrase.c_str() ) > 0; +#endif + } + + + } +} diff --git a/src/mongo/db/fts/fts_matcher.h b/src/mongo/db/fts/fts_matcher.h new 
file mode 100644
index 00000000000..c5478d63b78
--- /dev/null
+++ b/src/mongo/db/fts/fts_matcher.h
@@ -0,0 +1,67 @@
+// fts_matcher.h
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include "mongo/db/fts/fts_query.h"
+#include "mongo/db/fts/fts_spec.h"
+#include "mongo/db/fts/tokenizer.h"
+
+namespace mongo {
+
+    namespace fts {
+
+        /**
+         * Checks the parts of a text query that cannot be answered from index
+         * terms alone: negated terms and (negated) phrases. Holds its own copies
+         * of the query and the index spec.
+         */
+        class FTSMatcher {
+        public:
+            FTSMatcher( const FTSQuery& query, const FTSSpec& spec );
+
+            /**
+             * @return true if obj has a negated term
+             */
+            bool hasNegativeTerm(const BSONObj& obj ) const;
+
+            /**
+             * @return true if obj is ok by all phrases
+             *         so all full phrases and no negated
+             */
+            bool phrasesMatch( const BSONObj& obj ) const;
+
+            /**
+             * @return true if phrase occurs in one of the indexed string fields of obj
+             */
+            bool phraseMatch( const string& phrase, const BSONObj& obj ) const;
+
+            /**
+             * @return true if obj has no negated term and satisfies all phrases
+             */
+            bool matchesNonTerm( const BSONObj& obj ) const {
+                return !hasNegativeTerm( obj ) && phrasesMatch( obj );
+            }
+
+        private:
+            /**
+             * @return true if any string found while walking obj has a negated term
+             */
+            bool _hasNegativeTerm_recurse(const BSONObj& obj ) const;
+
+            /**
+             * @return true if raw has a negated term
+             */
+            bool _hasNegativeTerm_string( const string& raw ) const;
+
+            /** @return true if any string found while walking obj contains phrase */
+            bool _phraseRecurse( const string& phrase, const BSONObj& obj ) const;
+            /** @return true if haystack contains phrase */
+            bool _phraseMatches( const string& phrase, const string& haystack ) const;
+
+            FTSQuery _query;
+            FTSSpec _spec;
+            Stemmer _stemmer;
+        };
+
+    }
+}
diff --git a/src/mongo/db/fts/fts_matcher_test.cpp b/src/mongo/db/fts/fts_matcher_test.cpp
new file
mode 100644
index 00000000000..15369980885
--- /dev/null
+++ b/src/mongo/db/fts/fts_matcher_test.cpp
@@ -0,0 +1,63 @@
+// fts_matcher_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/pch.h"
+
+#include "mongo/db/fts/fts_matcher.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+    namespace fts {
+
+        // a negated term is found inside a nested object when the index is a
+        // wildcard ("$**") index
+        TEST( FTSMatcher, NegWild1 ) {
+            FTSQuery q;
+            q.parse( "foo -bar", "english" );
+            FTSMatcher m( q,
+                          FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) );
+
+            // NOTE(review): the two assertions below are identical; the second may
+            // have been meant to cover a different document - confirm
+            ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
+            ASSERT( m.hasNegativeTerm( BSON( "x" << BSON( "y" << "bar" ) ) ) );
+        }
+
+        // phrases match as substrings of a string field; a changed or inserted
+        // word breaks the match
+        TEST( FTSMatcher, Phrase1 ) {
+            FTSQuery q;
+            q.parse( "foo \"table top\"", "english" );
+            FTSMatcher m( q,
+                          FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "$**" << "fts" ) ) ) ) );
+
+            ASSERT( m.phraseMatch( "table top", BSON( "x" << "table top" ) ) );
+            ASSERT( m.phraseMatch( "table top", BSON( "x" << " asd table top asd" ) ) );
+            ASSERT( !m.phraseMatch( "table top", BSON( "x" << "tablz top" ) ) );
+            ASSERT( !m.phraseMatch( "table top", BSON( "x" << " asd tablz top asd" ) ) );
+
+            ASSERT( m.phrasesMatch( BSON( "x" << "table top" ) ) );
+            ASSERT( !m.phrasesMatch( BSON( "x" << "table a top" ) ) );
+
+        }
+
+        // with an explicit (non-wildcard) field, a phrase is also found inside
+        // an array of strings
+        TEST( FTSMatcher, Phrase2 ) {
+            FTSQuery q;
+            q.parse( "foo \"table top\"", "english" );
+            FTSMatcher m( q,
+                          FTSSpec( FTSSpec::fixSpec( BSON( "key" << BSON( "x" << "fts" ) ) ) ) );
+            ASSERT( m.phraseMatch( "table top",
+                                   BSON( "x" << BSON_ARRAY( "table top" ) ) ) );
+        }
+
+    }
+}
diff --git a/src/mongo/db/fts/fts_query.cpp b/src/mongo/db/fts/fts_query.cpp
new file mode 100644
index 00000000000..0f32ba1afad
--- /dev/null
+++ b/src/mongo/db/fts/fts_query.cpp
@@ -0,0 +1,173 @@
+// fts_query.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/util/mongoutils/str.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + Status FTSQuery::parse(const string& query, const string& language) { + _search = query; + _language = language; + + const StopWords* stopWords = StopWords::getStopWords( language ); + Stemmer stemmer( language ); + + bool inNegation = false; + bool inPhrase = false; + + str::stream phrase; + + Tokenizer i( _language, query ); + while ( i.more() ) { + Token t = i.next(); + + if ( t.type == Token::TEXT ) { + string s = t.data.toString(); + + if ( inPhrase ) { + if ( phrase.ss.len() > 0 ) + phrase << ' '; + phrase << s; + } + + if ( inPhrase && inNegation ) { + // don't add term + } + else { + _addTerm( stopWords, stemmer, s, inNegation ); + } + + if ( inNegation && !inPhrase ) + inNegation = false; + } + else if ( t.type == Token::DELIMITER ) { + char c = t.data[0]; + if ( c == '-' ) { + if ( t.previousWhiteSpace ) + inNegation = true; + } + else if ( c == '"' ) { + if ( inPhrase ) { + // end of a phrase + if ( inNegation ) + _negatedPhrases.push_back( tolowerString( phrase ) ); + else + _phrases.push_back( tolowerString( phrase ) ); + inNegation = false; + inPhrase = false; + } + else { + // start of a phrase + inPhrase = true; + phrase.ss.reset(); + } + } + } + else { + abort(); + } + } + + return Status::OK(); + } + + void FTSQuery::_addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ) { + string word = tolowerString( term ); + if ( sw->isStopWord( word ) ) + return; + word = stemmer.stem( word ); + if ( negated ) + _negatedTerms.insert( word ); + else + _terms.push_back( word ); + } + + namespace { + void _debugHelp( stringstream& ss, const set<string>& s, const string& sep ) { + bool first = true; + for ( set<string>::const_iterator i = s.begin(); i != s.end(); ++i ) { + if ( 
first ) + first = false; + else + ss << sep; + ss << *i; + } + } + + void _debugHelp( stringstream& ss, const vector<string>& v, const string& sep ) { + set<string> s( v.begin(), v.end() ); + _debugHelp( ss, s, sep ); + } + + void _debugHelp( stringstream& ss, const unordered_set<string>& v, const string& sep ) { + set<string> s( v.begin(), v.end() ); + _debugHelp( ss, s, sep ); + } + + } + + string FTSQuery::toString() const { + stringstream ss; + ss << "FTSQuery\n"; + + ss << " terms: "; + _debugHelp( ss, getTerms(), ", " ); + ss << "\n"; + + ss << " negated terms: "; + _debugHelp( ss, getNegatedTerms(), ", " ); + ss << "\n"; + + ss << " phrases: "; + _debugHelp( ss, getPhr(), ", " ); + ss << "\n"; + + ss << " negated phrases: "; + _debugHelp( ss, getNegatedPhr(), ", " ); + ss << "\n"; + + return ss.str(); + } + + string FTSQuery::debugString() const { + stringstream ss; + + _debugHelp( ss, getTerms(), "|" ); + ss << "||"; + + _debugHelp( ss, getNegatedTerms(), "|" ); + ss << "||"; + + _debugHelp( ss, getPhr(), "|" ); + ss << "||"; + + _debugHelp( ss, getNegatedPhr(), "|" ); + + return ss.str(); + } + } +} diff --git a/src/mongo/db/fts/fts_query.h b/src/mongo/db/fts/fts_query.h new file mode 100644 index 00000000000..7022760b3a7 --- /dev/null +++ b/src/mongo/db/fts/fts_query.h @@ -0,0 +1,80 @@ +// fts_query.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#pragma once + +#include <string> +#include <vector> + +#include "mongo/base/status.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/platform/unordered_set.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + using std::string; + using std::vector; + using std::set; + + class FTSQuery { + + public: + Status parse(const string& query, const string& language); + + const vector<string>& getTerms() const { return _terms; } + const unordered_set<string>& getNegatedTerms() const { return _negatedTerms; } + + const vector<string>& getPhr() const { return _phrases; } + const vector<string>& getNegatedPhr() const { return _negatedPhrases; } + + /** + * @return true if any negations or phrase + or - + */ + bool hasNonTermPieces() const { + return + _negatedTerms.size() > 0 || + _phrases.size() > 0 || + _negatedPhrases.size() > 0; + } + + string getSearch() const { return _search; } + string getLanguage() const { return _language; } + + string toString() const; + + string debugString() const; + + protected: + string _search; + string _language; + vector<string> _terms; + unordered_set<string> _negatedTerms; + vector<string> _phrases; + vector<string> _negatedPhrases; + + private: + void _addTerm( const StopWords* sw, Stemmer& stemmer, const string& term, bool negated ); + }; + + } +} + diff --git a/src/mongo/db/fts/fts_query_test.cpp b/src/mongo/db/fts/fts_query_test.cpp new file mode 100644 index 00000000000..92bd6ee222a --- /dev/null +++ b/src/mongo/db/fts/fts_query_test.cpp @@ -0,0 +1,73 @@ +// fts_query_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#include "mongo/db/fts/fts_query.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( FTSQuery, Basic1 ) { + FTSQuery q; + ASSERT( q.parse( "this is fun", "english" ).isOK() ); + + ASSERT_EQUALS( 1U, q.getTerms().size() ); + ASSERT_EQUALS( "fun", q.getTerms()[0] ); + ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( 0U, q.getPhr().size() ); + ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); + } + + TEST( FTSQuery, Neg1 ) { + FTSQuery q; + ASSERT( q.parse( "this is -really fun", "english" ).isOK() ); + + ASSERT_EQUALS( 1U, q.getTerms().size() ); + ASSERT_EQUALS( "fun", q.getTerms()[0] ); + ASSERT_EQUALS( 1U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( "realli", *q.getNegatedTerms().begin() ); + } + + TEST( FTSQuery, Phrase1 ) { + FTSQuery q; + ASSERT( q.parse( "doing a \"phrase test\" for fun", "english" ).isOK() ); + + ASSERT_EQUALS( 3U, q.getTerms().size() ); + ASSERT_EQUALS( 0U, q.getNegatedTerms().size() ); + ASSERT_EQUALS( 1U, q.getPhr().size() ); + ASSERT_EQUALS( 0U, q.getNegatedPhr().size() ); + + ASSERT_EQUALS( "phrase test", q.getPhr()[0] ); + ASSERT_EQUALS( "fun|phrase|test||||phrase test||", q.debugString() ); + } + + TEST( FTSQuery, NegPhrase1 ) { + FTSQuery q; + ASSERT( q.parse( "doing a -\"phrase test\" for fun", "english" ).isOK() ); + ASSERT_EQUALS( "fun||||||phrase test", q.debugString() ); + } + + TEST( FTSQuery, Mix1 ) { + FTSQuery q; + ASSERT( q.parse( "\"industry\" -Melbourne -Physics", "english" ).isOK() ); + ASSERT_EQUALS( "industri||melbourn|physic||industry||", q.debugString() ); + 
} + + } +} diff --git a/src/mongo/db/fts/fts_search.cpp b/src/mongo/db/fts/fts_search.cpp new file mode 100644 index 00000000000..5686cb89ffb --- /dev/null +++ b/src/mongo/db/fts/fts_search.cpp @@ -0,0 +1,175 @@ +// fts_search.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include "mongo/pch.h" + +#include "mongo/db/btreecursor.h" +#include "mongo/db/fts/fts_index_format.h" +#include "mongo/db/fts/fts_search.h" +#include "mongo/db/kill_current_op.h" +#include "mongo/db/pdfile.h" + +namespace mongo { + + namespace fts { + + /* + * Constructor generates query and term dictionaries + * @param ns, namespace + * @param idxNum, index number + * @param search, query string + * @param language, language of the query + * @param filter, filter object + */ + FTSSearch::FTSSearch( NamespaceDetails* ns, + const IndexDetails& id, + const BSONObj& indexPrefix, + const FTSQuery& query, + const BSONObj& filter ) + : _ns( ns ), + _id( id ), + _fts( static_cast<FTSIndex*>(_id.getSpec().getType()) ), + _indexPrefix( indexPrefix ), + _query( query ), + _ftsMatcher( query, static_cast<FTSIndex*>(_id.getSpec().getType())->getFtsSpec() ) { + + if ( !filter.isEmpty() ) + _matcher.reset( new CoveredIndexMatcher( filter, _fts->keyPattern() ) ); + + _keysLookedAt = 0; + _objectsLookedAt = 0; + } + + bool FTSSearch::_ok( Record* record ) const { + if ( !_query.hasNonTermPieces() ) + return true; 
+ return _ftsMatcher.matchesNonTerm( BSONObj::make( record ) ); + } + + /* + * GO: sets the tree cursors on each term in terms, processes the terms by advancing + * the terms cursors and storing the partial + * results and lastly calculates the top results + * @param results, the priority queue containing the top results + * @param limit, number of results in the priority queue + */ + void FTSSearch::go(Results* results, unsigned limit ) { + vector< shared_ptr<BtreeCursor> > cursors; + + for ( unsigned i = 0; i < _query.getTerms().size(); i++ ) { + const string& term = _query.getTerms()[i]; + BSONObj min = FTSIndexFormat::getIndexKey( MAX_WEIGHT, term, _indexPrefix ); + BSONObj max = FTSIndexFormat::getIndexKey( 0, term, _indexPrefix ); + shared_ptr<BtreeCursor> c( BtreeCursor::make( _ns, _id, min, max, true, -1 ) ); + cursors.push_back( c ); + } + + while ( !inShutdown() ) { + bool gotAny = false; + for ( unsigned i = 0; i < cursors.size(); i++ ) { + if ( cursors[i]->eof() ) + continue; + gotAny = true; + _process( cursors[i].get() ); + cursors[i]->advance(); + } + + if ( !gotAny ) + break; + + RARELY killCurrentOp.checkForInterrupt(); + } + + + // priority queue using a compare that grabs the lowest of two ScoredLocations by score. 
+ for ( Scores::iterator i = _scores.begin(); i != _scores.end(); ++i ) { + + if ( i->second < 0 ) + continue; + + // priority queue + if ( results->size() < limit ) { // case a: queue unfilled + + if ( !_ok( i->first ) ) + continue; + + results->push( ScoredLocation( i->first, i->second ) ); + + } + else if ( i->second > results->top().score ) { // case b: queue filled + + if ( !_ok( i->first ) ) + continue; + + results->pop(); + results->push( ScoredLocation( i->first, i->second ) ); + } + else { + // else do nothing (case c) + } + + } + + } + + /* + * Takes a cursor and updates the partial score for said cursor in _scores map + * @param cursor, btree cursor pointing to the current document to be scored + */ + void FTSSearch::_process( BtreeCursor* cursor ) { + _keysLookedAt++; + + BSONObj key = cursor->currKey(); + + BSONObjIterator i( key ); + BSONElement indexToken = i.next(); + BSONElement scoreElement = i.next(); + + double score = scoreElement.number(); + + double& cur = _scores[(cursor->currLoc()).rec()]; + + if ( cur < 0 ) { + // already been rejected + return; + } + + if ( cur == 0 && _matcher.get() ) { + // we haven't seen this before and we have a matcher + MatchDetails d; + if ( !_matcher->matchesCurrent( cursor, &d ) ) { + cur = -1; + } + + if ( d.hasLoadedRecord() ) + _objectsLookedAt++; + + if ( cur == -1 ) + return; + } + + if ( cur ) + cur += score * (1 + 1 / score); + else + cur += score; + + } + + } + +} diff --git a/src/mongo/db/fts/fts_search.h b/src/mongo/db/fts/fts_search.h new file mode 100644 index 00000000000..82e5b66f3b2 --- /dev/null +++ b/src/mongo/db/fts/fts_search.h @@ -0,0 +1,103 @@ +// fts_search.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <map> +#include <set> +#include <vector> +#include <queue> + +#include "mongo/base/disallow_copying.h" +#include "mongo/db/fts/fts_index.h" +#include "mongo/db/fts/fts_matcher.h" +#include "mongo/db/fts/fts_query.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/matcher.h" + +namespace mongo { + + class BtreeCursor; + + namespace fts { + + // priority queue template, for use when we're populating results + // vector returned to the user. extends the default priority_queue + // by providing direct access to the underlying vector, which should + // be used CAREFULLY because you can get into trouble.. + template <class T, class S, class C> + class a_priority_queue : public std::priority_queue<T, S, C> { + public: + // return the value of an element at position n when we call pq[n] + T operator[](const int &n) { return this->c[n]; } + // return underlying data structure. called dangerous because it is. 
+ S dangerous() { return this->c; } + }; + + typedef a_priority_queue<ScoredLocation, vector<ScoredLocation>, ScoredLocationComp> Results; + + class FTSSearch { + MONGO_DISALLOW_COPYING(FTSSearch); + public: + + typedef std::map<Record*,double> Scores; + + FTSSearch( NamespaceDetails* ns, + const IndexDetails& id, + const BSONObj& indexPrefix, + const FTSQuery& query, + const BSONObj& filter ); + + + void go(Results* results, unsigned limit ); + + const FTSIndex * getIndex() const { return _fts; } + + long long getKeysLookedAt() const { return _keysLookedAt; } + long long getObjLookedAt() const { return _objectsLookedAt; } + + private: + + void _process( BtreeCursor* cursor ); + + /** + * checks not index pieces + * i.e. prhases & negated terms + */ + bool _ok( Record* record ) const; + + NamespaceDetails* _ns; + const IndexDetails& _id; + FTSIndex* _fts; + BSONObj _indexPrefix; + FTSQuery _query; + FTSMatcher _ftsMatcher; + + scoped_ptr<CoveredIndexMatcher> _matcher; + + long long _keysLookedAt; + long long _objectsLookedAt; + + Scores _scores; + + }; + + } // namespace fts + +} // namespace mongo + diff --git a/src/mongo/db/fts/fts_spec.cpp b/src/mongo/db/fts/fts_spec.cpp new file mode 100644 index 00000000000..ab541b6a7f4 --- /dev/null +++ b/src/mongo/db/fts/fts_spec.cpp @@ -0,0 +1,395 @@ +// fts_spec.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/db/fts/fts_util.h" +#include "mongo/util/mongoutils/str.h" + +namespace mongo { + + namespace fts { + + using namespace mongoutils; + + const double MAX_WEIGHT = 1000000000.0; + + + FTSSpec::FTSSpec( const BSONObj& indexInfo ) { + _defaultLanguage = indexInfo["default_language"].valuestrsafe(); + _languageOverrideField = indexInfo["language_override"].valuestrsafe(); + + if ( _defaultLanguage.size() == 0 ) + _defaultLanguage = "english"; + if ( _languageOverrideField.size() == 0 ) + _languageOverrideField = "language"; + + _wildcard = false; + + // in this block we fill in the _weights map + { + BSONObjIterator i( indexInfo["weights"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + verify( e.isNumber() ); + + if ( WILDCARD == e.fieldName() ) { + _wildcard = true; + } + else { + double num = e.number(); + _weights[ e.fieldName() ] = num; + verify( num > 0 && num < MAX_WEIGHT ); + } + } + verify( _wildcard || _weights.size() ); + } + + // extra information + { + BSONObj keyPattern = indexInfo["key"].Obj(); + verify( keyPattern.nFields() >= 2 ); + BSONObjIterator i( keyPattern ); + + bool passedFTS = false; + + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "_fts" ) || + str::equals( e.fieldName(), "_ftsx" ) ) { + passedFTS = true; + continue; + } + + if ( passedFTS ) + _extraAfter.push_back( e.fieldName() ); + else + _extraBefore.push_back( e.fieldName() ); + } + + } + } + + bool FTSSpec::weight( const StringData& field, double* out ) const { + Weights::const_iterator i = _weights.find( field.toString() ); + if ( i == _weights.end() ) + return false; + *out = i->second; + return true; + } + + string FTSSpec::getLanguageToUse( const BSONObj& userDoc ) const { + BSONElement e = userDoc[_languageOverrideField]; + if ( e.type() == String ) { + const char * x = e.valuestrsafe(); + if ( strlen( x ) > 0 ) + return x; + } + return 
_defaultLanguage; + } + + + /* + * Calculates the score for all terms in a document of a collection + * @param obj, the document in the collection being parsed + * @param term_freqs, map<string,double> to fill up + */ + void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const { + + string language = getLanguageToUse( obj ); + + Stemmer stemmer(language); + Tools tools(language); + tools.stemmer = &stemmer; + tools.stopwords = StopWords::getStopWords( language ); + + if ( wildcard() ) { + // if * is specified for weight, we can recurse over all fields. + _scoreRecurse(tools, obj, term_freqs); + return; + } + + // otherwise, we need to remember the different weights for each field + // and act accordingly (in other words, call _score) + for ( Weights::const_iterator i = _weights.begin(); i != _weights.end(); i++ ) { + const char * leftOverName = i->first.c_str(); + // name of field + BSONElement e = obj.getFieldDottedOrArray(leftOverName); + // weight associated to name of field + double weight = i->second; + + if ( e.eoo() ) { + // do nothing + } + else if ( e.type() == Array ) { + BSONObjIterator j( e.Obj() ); + while ( j.more() ) { + BSONElement x = j.next(); + if ( leftOverName[0] && x.isABSONObj() ) + x = x.Obj().getFieldDotted( leftOverName ); + if ( x.type() == String ) + _scoreString( tools, x.valuestr(), term_freqs, weight ); + } + } + else if ( e.type() == String ) { + _scoreString( tools, e.valuestr(), term_freqs, weight ); + } + + } + } + + + /* + * Recurses over all fields of an obj (document in collection) + * and fills term,score map term_freqs + * @param tokenizer, tokenizer to tokenize a string into terms + * @param obj, object being parsed + * term_freqs, map <term,score> to be filled up + */ + void FTSSpec::_scoreRecurse(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const { + BSONObjIterator j( obj ); + while ( j.more() ) { + BSONElement x = j.next(); + + if ( languageOverrideField() == 
x.fieldName() ) + continue; + + if (x.type() == String) { + double w = 1; + weight( x.fieldName(), &w ); + _scoreString(tools, x.valuestr(), term_freqs, w); + } + else if ( x.isABSONObj() ) { + _scoreRecurse( tools, x.Obj(), term_freqs); + } + + } + } + + namespace { + struct ScoreHelperStruct { + ScoreHelperStruct() + : freq(0), count(0), exp(0){ + } + double freq; + double count; + double exp; + }; + typedef unordered_map<string,ScoreHelperStruct> ScoreHelperMap; + } + + void FTSSpec::_scoreString( const Tools& tools, + const StringData& raw, + TermFrequencyMap* docScores, + double weight ) const { + + ScoreHelperMap terms; + + unsigned numTokens = 0; + + Tokenizer i( tools.language, raw ); + while ( i.more() ) { + Token t = i.next(); + if ( t.type != Token::TEXT ) + continue; + + string term = t.data.toString(); + makeLower( &term ); + term = tools.stemmer->stem( term ); + if ( tools.stopwords->isStopWord( term ) ) + continue; + + ScoreHelperStruct& data = terms[term]; + + if ( data.exp ) + data.exp *= 2; + else + data.exp = 1; + data.count += 1; + data.freq += ( 1 / data.exp ); + + numTokens++; + } + + for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { + + const string& term = i->first; + const ScoreHelperStruct& data = i->second; + + // in order to adjust weights as a function of term count as it + // relates to total field length. ie. is this the only word or + // a frequently occuring term? or does it only show up once in + // a long block of text? + + double coeff = ( 0.5 * data.count / numTokens ) + 0.5; + + // if term is identical to the raw form of the + // field (untokenized) give it a small boost. 
+ double adjustment = 1; + if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) + adjustment += 0.1; + + double& score = (*docScores)[term]; + score += ( weight * data.freq * coeff * adjustment ); + verify( score <= MAX_WEIGHT ); + } + } + + Status FTSSpec::getIndexPrefix( const BSONObj& query, BSONObj* out ) const { + if ( numExtraBefore() == 0 ) { + *out = BSONObj(); + return Status::OK(); + } + + BSONObjBuilder b; + for ( unsigned i = 0; i < numExtraBefore(); i++ ) { + BSONElement e = query.getFieldDotted(extraBefore(i)); + if ( e.eoo() ) + return Status( ErrorCodes::BadValue, + str::stream() + << "need have an eaulity filter on: " + << extraBefore(i) ); + + if ( e.isABSONObj() && e.Obj().firstElement().getGtLtOp( -1 ) != -1 ) + return Status( ErrorCodes::BadValue, + str::stream() + << "need have an eaulity filter on: " + << extraBefore(i) ); + + b.append( e ); + } + *out = b.obj(); + return Status::OK(); + } + + void _addFTSStuff( BSONObjBuilder* b ) { + b->append( "_fts", INDEX_NAME ); + b->append( "_ftsx", 1 ); + } + + BSONObj FTSSpec::fixSpec( const BSONObj& spec ) { + map<string,int> m; + + BSONObj keyPattern; + { + BSONObjBuilder b; + bool addedFtsStuff = false; + + BSONObjIterator i( spec["key"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "_fts" ) || + str::equals( e.fieldName(), "_ftsx" ) ) { + continue; + } + else if ( e.type() == String && + ( str::equals( "fts", e.valuestr() ) || + str::equals( "text", e.valuestr() ) ) ) { + + if ( !addedFtsStuff ) { + _addFTSStuff( &b ); + addedFtsStuff = true; + } + + m[e.fieldName()] = 1; + } + else { + b.append( e ); + } + } + + if ( !addedFtsStuff ) + _addFTSStuff( &b ); + + keyPattern = b.obj(); + } + + if ( spec["weights"].isABSONObj() ) { + BSONObjIterator i( spec["weights"].Obj() ); + while ( i.more() ) { + BSONElement e = i.next(); + m[e.fieldName()] = e.numberInt(); + } + } + else if ( spec["weights"].str() == WILDCARD ) { + m[WILDCARD] = 
1; + } + + BSONObj weights; + { + BSONObjBuilder b; + for ( map<string,int>::iterator i = m.begin(); i != m.end(); ++i ) + b.append( i->first, i->second ); + weights = b.obj(); + } + + string default_language(spec.getStringField("default_language")); + if ( default_language.empty() ) + default_language = "english"; + + string language_override(spec.getStringField("language_override")); + if ( language_override.empty() ) + language_override = "language"; + + int version = 0; + + BSONObjBuilder b; + BSONObjIterator i( spec ); + while ( i.more() ) { + BSONElement e = i.next(); + if ( str::equals( e.fieldName(), "key" ) ) { + b.append( "key", keyPattern ); + } + else if ( str::equals( e.fieldName(), "weights" ) ) { + b.append( "weights", weights ); + weights = BSONObj(); + } + else if ( str::equals( e.fieldName(), "default_language" ) ) { + b.append( "default_language", default_language); + default_language = ""; + } + else if ( str::equals( e.fieldName(), "language_override" ) ) { + b.append( "language_override", language_override); + language_override = ""; + } + else if ( str::equals( e.fieldName(), "v" ) ) { + version = e.numberInt(); + } + else { + b.append( e ); + } + } + + if ( !weights.isEmpty() ) + b.append( "weights", weights ); + if ( !default_language.empty() ) + b.append( "default_language", default_language); + if ( !language_override.empty() ) + b.append( "language_override", language_override); + + b.append( "v", version ); + + return b.obj(); + + } + + } +} diff --git a/src/mongo/db/fts/fts_spec.h b/src/mongo/db/fts/fts_spec.h new file mode 100644 index 00000000000..e3ebf24f76b --- /dev/null +++ b/src/mongo/db/fts/fts_spec.h @@ -0,0 +1,108 @@ +// fts_spec.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#pragma once + +#include <map> +#include <vector> +#include <string> + +#include "mongo/db/fts/fts_util.h" +#include "mongo/db/fts/stemmer.h" +#include "mongo/db/fts/stop_words.h" +#include "mongo/db/fts/tokenizer.h" +#include "mongo/platform/unordered_map.h" + +namespace mongo { + + namespace fts { + + extern const double MAX_WEIGHT; + + typedef std::map<string,double> Weights; // TODO cool map + + typedef unordered_map<string,double> TermFrequencyMap; + + + class FTSSpec { + + struct Tools { + Tools( string language ) + : language( language ){} + const std::string& language; + const Stemmer* stemmer; + const StopWords* stopwords; + }; + + public: + FTSSpec( const BSONObj& indexInfo ); + + bool wildcard() const { return _wildcard; } + const string& defaultLanguage() const { return _defaultLanguage; } + const string& languageOverrideField() const { return _languageOverrideField; } + + size_t numExtraBefore() const { return _extraBefore.size(); } + const std::string& extraBefore( unsigned i ) const { return _extraBefore[i]; } + + size_t numExtraAfter() const { return _extraAfter.size(); } + const std::string& extraAfter( unsigned i ) const { return _extraAfter[i]; } + + string getLanguageToUse( const BSONObj& userDoc ) const; + + void scoreDocument( const BSONObj& obj, TermFrequencyMap* scores ) const; + + /** + * given a query, pulls out the pieces (in order) that go in the index first + */ + Status getIndexPrefix( const BSONObj& filter, BSONObj* out ) const; + + const Weights& weights() const { return _weights; } + + /** + * @param out - untouched if field 
isn't present + * @return if field is here + */ + bool weight( const StringData& field, double* out ) const; + + + static BSONObj fixSpec( const BSONObj& spec ); + private: + void _scoreRecurse(const Tools& tools, + const BSONObj& obj, + TermFrequencyMap* term_freqs ) const; + + void _scoreString( const Tools& tools, + const StringData& raw, + TermFrequencyMap* term_freqs, + double weight ) const; + + string _defaultLanguage; + string _languageOverrideField; + bool _wildcard; + + // _weights stores a mapping between the fields and the value as a double + // basically, how much should an occurence of (query term) in (field) be worth + Weights _weights; + + // other fields to index + std::vector<string> _extraBefore; + std::vector<string> _extraAfter; + }; + + } +} diff --git a/src/mongo/db/fts/fts_spec_test.cpp b/src/mongo/db/fts/fts_spec_test.cpp new file mode 100644 index 00000000000..541bd4a56d8 --- /dev/null +++ b/src/mongo/db/fts/fts_spec_test.cpp @@ -0,0 +1,139 @@ +// fts_spec_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/pch.h" + +#include "mongo/db/fts/fts_spec.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( FTSSpec, Fix1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + BSONObj fixed = FTSSpec::fixSpec( user ); + BSONObj fixed2 = FTSSpec::fixSpec( fixed ); + ASSERT_EQUALS( fixed, fixed2 ); + } + + TEST( FTSSpec, ScoreSingleField1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat run" ), &m ); + ASSERT_EQUALS( 3U, m.size() ); + ASSERT_EQUALS( m["cat"], m["sat"] ); + ASSERT_EQUALS( m["cat"], m["run"] ); + ASSERT( m["cat"] > 0 ); + } + + TEST( FTSSpec, ScoreMultipleField1 ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat run" + << "text" << "cat book" ), + &m ); + + ASSERT_EQUALS( 4U, m.size() ); + ASSERT_EQUALS( m["sat"], m["run"] ); + ASSERT( m["sat"] > 0 ); + + ASSERT( m["cat"] > m["sat"] ); + ASSERT( m["cat"] > m["book"] ); + ASSERT( m["book"] > 0 ); + ASSERT( m["book"] < m["sat"] ); + } + + + TEST( FTSSpec, ScoreRepeatWord ) { + BSONObj user = BSON( "key" << BSON( "title" << "fts" << + "text" << "fts" ) << + "weights" << BSON( "title" << 10 ) ); + + FTSSpec spec( FTSSpec::fixSpec( user ) ); + + TermFrequencyMap m; + spec.scoreDocument( BSON( "title" << "cat sat sat run run run" ), &m ); + ASSERT_EQUALS( 3U, m.size() ); + ASSERT( m["cat"] > 0 ); + ASSERT( m["sat"] > m["cat"] ); + ASSERT( m["run"] > m["sat"] ); + + } + + TEST( FTSSpec, Extra1 ) { + BSONObj user = BSON( "key" << BSON( "data" << "fts" ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + 
ASSERT_EQUALS( 0U, spec.numExtraBefore() ); + ASSERT_EQUALS( 0U, spec.numExtraAfter() ); + } + + TEST( FTSSpec, Extra2 ) { + BSONObj user = BSON( "key" << BSON( "data" << "fts" << "x" << 1 ) ); + FTSSpec spec( FTSSpec::fixSpec( user ) ); + ASSERT_EQUALS( 0U, spec.numExtraBefore() ); + ASSERT_EQUALS( 1U, spec.numExtraAfter() ); + ASSERT_EQUALS( StringData("x"), spec.extraAfter(0) ); + } + + TEST( FTSSpec, Extra3 ) { + BSONObj user = BSON( "key" << BSON( "x" << 1 << "data" << "fts" ) ); + BSONObj fixed = FTSSpec::fixSpec( user ); + + ASSERT_EQUALS( BSON( "x" << 1 << + "_fts" << "text" << + "_ftsx" << 1 ), + fixed["key"].Obj() ); + ASSERT_EQUALS( BSON( "data" << 1 ), + fixed["weights"].Obj() ); + + BSONObj fixed2 = FTSSpec::fixSpec( fixed ); + ASSERT_EQUALS( fixed, fixed2 ); + + FTSSpec spec( fixed ); + ASSERT_EQUALS( 1U, spec.numExtraBefore() ); + ASSERT_EQUALS( StringData("x"), spec.extraBefore(0) ); + ASSERT_EQUALS( 0U, spec.numExtraAfter() ); + + BSONObj prefix; + + ASSERT( spec.getIndexPrefix( BSON( "x" << 2 ), &prefix ).isOK() ); + ASSERT_EQUALS( BSON( "x" << 2 ), prefix ); + + ASSERT( spec.getIndexPrefix( BSON( "x" << 3 << "y" << 4 ), &prefix ).isOK() ); + ASSERT_EQUALS( BSON( "x" << 3 ), prefix ); + + ASSERT( !spec.getIndexPrefix( BSON( "x" << BSON( "$gt" << 5 ) ), &prefix ).isOK() ); + ASSERT( !spec.getIndexPrefix( BSON( "y" << 4 ), &prefix ).isOK() ); + ASSERT( !spec.getIndexPrefix( BSONObj(), &prefix ).isOK() ); + } + + } +} diff --git a/src/mongo/db/fts/fts_util.cpp b/src/mongo/db/fts/fts_util.cpp new file mode 100644 index 00000000000..ace11b67409 --- /dev/null +++ b/src/mongo/db/fts/fts_util.cpp @@ -0,0 +1,30 @@ +// fts_util.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. 
namespace mongo {

    namespace fts {

        // Definitions of the string constants declared `extern` in fts_util.h.

        // NOTE(review): presumably the index-type name for text indexes (the fixSpec
        // tests expect a fixed key of the form { _fts : "text", ... }) -- confirm
        // against the callers.
        const std::string INDEX_NAME = "text";
        // NOTE(review): looks like the "all fields" wildcard field specifier --
        // confirm against FTSSpec, which is not visible from here.
        const std::string WILDCARD = "$**";

    }
}
/**
 * Lower-cases *s in place (destructive!), one byte at a time, using tolower().
 *
 * Only bytes that the current C locale considers upper-case letters change;
 * the length of the string is never modified.
 *
 * @param s string to modify; must not be NULL.
 */
inline void makeLower( std::string* s ) {
    for ( std::string::iterator it = s->begin(); it != s->end(); ++it ) {
        // tolower() is only defined for EOF and for values representable as
        // unsigned char.  Where plain char is signed, a byte >= 0x80 (e.g. any
        // UTF-8 continuation byte) would otherwise be passed as a negative int,
        // which is undefined behaviour, so go through unsigned char first.
        const unsigned char byte = static_cast<unsigned char>( *it );
        *it = static_cast<char>( tolower( byte ) );
    }
}
def generate( header, source, language_files ):
    """Generate the C++ header/source pair that defines loadStopWordMap().

    header         -- path of the header file to write (declaration only).
    source         -- path of the source file to write; it #includes the header
                      by its base name (directory part stripped at "/" or "\\").
    language_files -- paths of stop-word lists, one word per line.  The language
                      name is the text between the last "_" and the following
                      "." of the path, e.g. "stop_words_english.txt" -> "english".

    Everything is read and written as raw bytes, so each word list is copied
    into the generated source in whatever encoding it is stored in, and the
    script behaves the same on Python 2 and Python 3.
    """
    def to_bytes( text ):
        # The output files are opened in binary mode; Python 3 refuses to
        # write str objects to them, so encode anything that is not bytes yet.
        return text if isinstance( text, bytes ) else text.encode( "utf-8" )

    print( "header: %s" % header )
    print( "source: %s" % source )
    print( "language_files:" )
    for x in language_files:
        print( "\t%s" % x )

    with open( header, "wb" ) as out:
        out.write( to_bytes( """
#pragma once
#include <map>
#include <set>
#include <string>
namespace mongo {
namespace fts {

  void loadStopWordMap( std::map< std::string, std::set< std::string > >* m );
}
}
""" ) )

    header_name = header.rpartition( "/" )[2].rpartition( "\\" )[2]

    # "with" guarantees the generated source is flushed and closed even if one
    # of the language files cannot be read.
    with open( source, "wb" ) as out:
        out.write( to_bytes( '#include "%s"' % header_name ) )
        out.write( to_bytes( """
namespace mongo {
namespace fts {

  void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ) {

""" ) )

        for l_file in language_files:
            l = l_file.rpartition( "_" )[2].partition( "." )[0]

            out.write( to_bytes( '    // %s\n' % l_file ) )
            out.write( b'    {\n' )
            out.write( to_bytes( '      std::set< std::string >& l = (*m)["%s"];\n' % l ) )
            with open( l_file, "rb" ) as words:
                for word in words:
                    word = word.strip()
                    if not word:
                        # a blank line is not a stop word
                        continue
                    # keep the generated C++ string literal well-formed
                    word = word.replace( b"\\", b"\\\\" ).replace( b'"', b'\\"' )
                    out.write( b'      l.insert( "' + word + b'" );\n' )
            out.write( b'    }\n' )

        out.write( to_bytes( """
  }
} // namespace fts
} // namespace mongo
""" ) )
+*/ + +#include <string> + +#include "mongo/db/fts/stemmer.h" + +namespace mongo { + + namespace fts { + + Stemmer::Stemmer( const string& language ) { + _stemmer = NULL; + if ( language != "none" ) + _stemmer = sb_stemmer_new(language.c_str(), "UTF_8"); + } + + Stemmer::~Stemmer() { + if ( _stemmer ) { + sb_stemmer_delete(_stemmer); + _stemmer = NULL; + } + } + + string Stemmer::stem( const StringData& word ) const { + if ( !_stemmer ) + return word.toString(); + + const sb_symbol* sb_sym = sb_stemmer_stem( _stemmer, + (const sb_symbol*)word.rawData(), + word.size() ); + + if ( sb_sym == NULL ) { + // out of memory + abort(); + } + + return string( (const char*)(sb_sym), sb_stemmer_length( _stemmer ) ); + } + + } + +} diff --git a/src/mongo/db/fts/stemmer.h b/src/mongo/db/fts/stemmer.h new file mode 100644 index 00000000000..d212cc01fce --- /dev/null +++ b/src/mongo/db/fts/stemmer.h @@ -0,0 +1,48 @@ +// stemmer.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + + +#pragma once + +#include <string> + +#include "libstemmer.h" + +#include "mongo/base/string_data.h" + +namespace mongo { + + namespace fts { + + /** + * maintains case + * but works + * running/Running -> run/Run + */ + class Stemmer { + public: + Stemmer( const std::string& language ); + ~Stemmer(); + + std::string stem( const StringData& word ) const; + private: + struct sb_stemmer* _stemmer; + }; + } +} + diff --git a/src/mongo/db/fts/stemmer_test.cpp b/src/mongo/db/fts/stemmer_test.cpp new file mode 100644 index 00000000000..808b8141a64 --- /dev/null +++ b/src/mongo/db/fts/stemmer_test.cpp @@ -0,0 +1,42 @@ +// stemmer_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#include "mongo/unittest/unittest.h" + +#include "mongo/db/fts/stemmer.h" + +namespace mongo { + namespace fts { + + TEST( English, Stemmer1 ) { + Stemmer s( "english" ); + ASSERT_EQUALS( "run", s.stem( "running" ) ); + ASSERT_EQUALS( "Run", s.stem( "Running" ) ); + } + + + TEST( English, Caps ) { + Stemmer s( "porter" ); + ASSERT_EQUALS( "unit", s.stem( "united" ) ); + ASSERT_EQUALS( "Unite", s.stem( "United" ) ); + } + + + } +} diff --git a/src/mongo/db/fts/stop_words.cpp b/src/mongo/db/fts/stop_words.cpp new file mode 100644 index 00000000000..0d664caf1bf --- /dev/null +++ b/src/mongo/db/fts/stop_words.cpp @@ -0,0 +1,73 @@ +// stop_words.cpp + +/** +* Copyright (C) 2012 10gen Inc. 
+* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <map> +#include <set> +#include <string> + +#include "mongo/db/fts/stop_words.h" + +#include "mongo/base/init.h" +#include "mongo/platform/unordered_map.h" + + + +namespace mongo { + + namespace fts { + + void loadStopWordMap( std::map< std::string, std::set< std::string > >* m ); + + namespace { + unordered_map<string,StopWords*> STOP_WORDS; + StopWords* empty = NULL; + } + + + StopWords::StopWords(){ + } + + StopWords::StopWords( const std::set<std::string>& words ) { + for ( std::set<std::string>::const_iterator i = words.begin(); i != words.end(); ++i ) + _words.insert( *i ); + } + + const StopWords* StopWords::getStopWords( const std::string& langauge ) { + unordered_map<string,StopWords*>::const_iterator i = STOP_WORDS.find( langauge ); + if ( i == STOP_WORDS.end() ) + return empty; + return i->second; + } + + + MONGO_INITIALIZER(StopWords)(InitializerContext* context) { + empty = new StopWords(); + + std::map< std::string, std::set< std::string > > raw; + loadStopWordMap( &raw ); + for ( std::map< std::string, std::set< std::string > >::const_iterator i = raw.begin(); + i != raw.end(); + ++i ) { + STOP_WORDS[i->first] = new StopWords( i->second ); + } + return Status::OK(); + } + + } + +} diff --git a/src/mongo/db/fts/stop_words.h b/src/mongo/db/fts/stop_words.h new file mode 100644 index 00000000000..5816afa560c --- /dev/null +++ 
b/src/mongo/db/fts/stop_words.h @@ -0,0 +1,50 @@ +// stop_words.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#pragma once + +#include <set> +#include <string> + +#include "mongo/platform/unordered_set.h" + +namespace mongo { + + namespace fts { + + class StopWords { + public: + StopWords(); + StopWords( const std::set<std::string>& words ); + + bool isStopWord( const std::string& word ) const { + return _words.count( word ) > 0; + } + + size_t numStopWords() const { return _words.size(); } + + static const StopWords* getStopWords( const std::string& langauge ); + private: + ~StopWords(){} + unordered_set<std::string> _words; + }; + + } +} + diff --git a/src/mongo/db/fts/stop_words_danish.txt b/src/mongo/db/fts/stop_words_danish.txt new file mode 100644 index 00000000000..1b3c2867fec --- /dev/null +++ b/src/mongo/db/fts/stop_words_danish.txt @@ -0,0 +1,100 @@ +få +intet +som +den +forrige +ni +alle +at +ned +et +næsten +fordi +og +jeres +seks +op +har +flere +hvis +hvem +andre +mens +fem +over +da +din +deres +for +ny +hvad +fra +kan +kommer +hvornår +jeg +denne +end +nogen +meget +man mand +på +store +ind +lav +ud +ej +hvordan +ingen +to +der +se +kom +dig +tre +eneste +dette +hans +hver +før +hendes +andet +syv +hvilken +hvor +det +de +hvorfor +god +otte +ikke +han +mig +fleste +ti +i +ene +med +til +stor +her +lille +mange +du +ses +begge +dog +eller +en +nyt +var 
+hun +enhver +fire +mere +nær +næste +men +noget +lidt +af diff --git a/src/mongo/db/fts/stop_words_dutch.txt b/src/mongo/db/fts/stop_words_dutch.txt new file mode 100644 index 00000000000..251822f9570 --- /dev/null +++ b/src/mongo/db/fts/stop_words_dutch.txt @@ -0,0 +1,48 @@ +hun +dit +zij +van +kan +een +zal +wat +dat +in +hij +die +zou +bij +met +al +ook +is +uit +en +hem +zei +heb +mij +was +ik +nog +of +zo +we +men +wij +ons +als +tot +wel +nu +aan +je +er +ze +af +hoe +had +te +me +het +dan diff --git a/src/mongo/db/fts/stop_words_english.txt b/src/mongo/db/fts/stop_words_english.txt new file mode 100644 index 00000000000..fbb6c3063fd --- /dev/null +++ b/src/mongo/db/fts/stop_words_english.txt @@ -0,0 +1,174 @@ +than +during +himself +your +theirs +most +same +wouldn't +at +it +here's +their +his +an +out +between +doesn't +not +those +only +yourself +mustn't +and +shouldn't +him +you'll +which +more +shan't +after +why +up +further +over +no +its +until +them +you +don't +few +why's +i've +for +ours +some +when's +they've +won't +herself +but +she +he'd +how's +were +how +we've +because +aren't +should +our +each +once +they'd +where +above +there +or +they'll +be +to +are +it's +too +itself +what +whom +has +they're +had +she'd +these +other +when +hasn't +by +we'll +having +then +against +he's +as +is +that +isn't +below +could +wasn't +who's +ourselves +so +any +he +cannot +weren't +was +my +would +we'd +yourselves +where's +couldn't +who +didn't +from +i'm +off +have +hers +i +am +themselves +of +before +i'll +here +while +what's +myself +ought +me +the +into +about +this +do +can't +a +her +that's +did +very +down +you've +we +you're +haven't +on +let's +such +they +in +with +being +doing +she's +yours +hadn't +nor +both +does +own +again +there's +he'll +i'd +under +you'd +through +we're +she'll +been +all +if diff --git a/src/mongo/db/fts/stop_words_finnish.txt b/src/mongo/db/fts/stop_words_finnish.txt new file mode 100644 index 
00000000000..0d898159660 --- /dev/null +++ b/src/mongo/db/fts/stop_words_finnish.txt @@ -0,0 +1,747 @@ +toistaiseksi +aika +tykö +haluavat +hyvinä +lähekkäin +menimme +mahdollista +silloin +olette +kaikkiaan +alla +haluaa +joku +jopa +uuden +haluat +lähellä +jolta +yhtä +sieltä +oleva +niistä +pienelle +samallasta +pieni +uudet +avulla +kahdessa +edes +aloitin +koska +pienin +olisi +esi +paljon +hyville +toisaalta +vaikeat +kumpikin +ensimmäiseksi +jonka +jotka +kennessästä +ensimmäistä +joihin +peräti +kymmenen +hänessä +vaikean +lähtien +neljän +olisin +ette +joutui +jokin +pienempi +vieri +kuka +viime +vaiheessa +olisit +neljää +aikovat +mikä +kaikkea +muiden +niiden +kumpikaan +kenellä +esillä +joo +menit +joudutte +olisivat +ne +sinua +muun +täältä +mitään +kohti +usea +ilmeisesti +koskaan +vähemmän +menossa +ajan +liian +omat +tietysti +jota +kenen +kahdella +aikaa +ensimmäiset +parhaillaan +muualla +joukosta +moi +hyvät +näissä +kuin +minut +ainoa +nro +eräiden +kanssasi +seuraavat +yhtäällä +jonkin +näissästä +ehkä +aloitamme +jälkeen +kanssa +aloititte +voisi +viimeinen +jolle +toki +myöskään +sekä +josta +eli +moni +olli +ellet +alkuisin +tulleet +kaikkialle +ainakin +muita +aikana +toinen +muuta +keiltä +nämä +meidän +tuhannen +edemmäs +mihin +vierekkäin +sinulle +yhteyteen +halusi +etenkin +täytyvät +niitä +mitä +alta +välillä +seitsemän +kumpainen +hyviltä +taa +me +mikään +hän +uudelleen +omien +kerta +tuhat +yksi +tulisit +tulimme +joissa +monesti +paitsi +heti +aloitat +joko +eniten +vuoden +yhdessä +suuri +haluamme +varten +olisitte +kenet +entinen +samaa +vaikeille +samallassa +tänään +edelleen +läpi +helposti +täysin +monta +jolloin +useimmiten +ensin +sinusta +suuntaan +omilta +uuteen +voidaan +näissälle +sisällä +siitä +kuitenkaan +mones +mukaan +asiasta +toisaalla +tällöin +että +yksittäin +erityisesti +vaikea +ellei +moniaalla +aluksi +antoi +tavoitteena +verran +runsaasti +ovat +menevät +toista +meni +vastaan +myös +heitä +menin +kokonaan 
+ja +hänen +suurin +meille +tosin +hetkellä +parhaiten +ensimmäisenä +kaksi +tulemme +alle +vuosina +ensimmäisen +aloitan +emme +te +heille +uusia +itse +halunnut +keille +samallalta +halusin +aikoo +minun +menet +näissähin +meneet +kovin +jouduin +toisen +tämä +halusit +useita +muassa +heiltä +tulette +kahta +jouduit +haluton +omia +minne +aloitettevaksi +siten +sinussa +aloitettu +voi +suuren +asioihin +vaikeilta +vähiten +asiat +vuosien +ellen +joilla +jälleen +suuret +omalta +tässä +asian +nuo +menette +kuten +kolme +voimme +eri +menivät +tahansa +vai +kaiken +joiden +puolesta +hyvissä +kumpainenkaan +kaikille +yhtäälle +toiseksi +hänelle +toisessa +toisesta +toiselle +keillä +lisäksi +tuolloin +muualle +edessä +älköön +keneen +mennyt +aikoina +myöhemmin +annettu +menemme +aoua +ensimmäisiä +mikäli +pieneen +keinä +annetteva +vähän +tuolla +jää +kannattaa +jonne +mukana +kolmas +häneen +jouduitte +joten +tänne +aloittivat +olitte +juuri +keitten +tullut +paremmin +sillä +meillä +avutta +keneltä +kahden +jos +omaan +aion +jo +samaan +sama +joskus +sinä +kaikki +yhtään +omille +sataa +pian +ihan +eivät +ylös +keissä +voin +siinä +lisää +halutessa +tämän +omaa +asioita +kanssanne +tapauksessa +kenestä +vaikeilla +minä +suurten +kenelle +tulisivat +melkein +kun +kaikkia +uutta +lähinnä +ole +hyvä +aiotte +asia +hyvää +uusi +tulisimme +aikajen +viimeisen +mukaansa +eteen +hänestä +omiksi +heistä +edestä +kukin +hyviksi +moniaalle +missä +takaa +omista +joilta +saakka +onkin +yhdeksän +sitä +ollut +hänet +antamatta +ensimmäisinä +avun +yhteensä +kolmen +hyviä +olemme +muulloin +kumpi +yli +sinun +erittäin +aiemmin +ympäri +voivat +keiksi +tulitte +tästä +siis +perusteella +sinut +täten +menen +asti +muu +omalle +haluamatta +keitä +apu +menee +yhä +voit +varsin +neljä +yhden +uusinta +tuolta +yksin +keskimäärin +entistä +sinne +samat +jotenkin +eilen +luo +takana +edellä +useasti +muutama +he +sitten +sinulta +pieneksi +enää +itsensä +tulla +joille +jotenkuten 
+kahdesta +kuutta +omissa +parempi +pakosti +olla +joutua +aloitattivat +silti +kahdelta +joudun +viiden +kehen +paikoittain +kukaan +näiden +aloitti +olevan +myöskin +jouduimme +mistä +tähän +takaisin +eräät +tai +tulee +siihen +olimme +molemmat +kesken +keneksi +jotta +toisella +tätä +jompikumpi +ennen +täytyy +kolmesti +voitte +pieneltä +en +niin +keistä +jotain +ensimmäisiksi +esimerkiksi +pienellä +takia +on +kautta +muka +oli +kai +hänellä +onko +itseään +joutumaan +aina +aivan +aloitimme +taemmas +todella +ensi +tulisitte +taas +mutta +vuotta +nyt +kenettä +tule +enemmän +vain +usein +haluatte +heissä +kannalta +avuksi +tuonne +toiseen +kaikkialta +hyviin +olevat +olisimme +uudeksi +tulen +elleivät +huomenna +olleet +moniaalta +kahdeksan +nopeasti +joudumme +heihin +antaa +johon +ei +hei +omassa +saman +lähelle +et +keiden +kanssani +yhteen +huolimatta +häneltä +alussa +olivat +aloitatte +kuuden +annettavaksi +esiin +näissältä +joukossa +halusimme +tuntuu +kenenä +vuosi +hyvien +tällä +tulevat +joista +läheltä +tulin +jokainen +joutuivat +vaikka +asiaa +halusivat +päälle +joukkoon +lähemmäs +joutuu +joutuvat +yhteydessä +vaikeista +nopeammin +vuoksi +toiselta +koko +hieman +kauemmas +kertaa +muutaman +kyse +kahteen +sen +edeltä +kuusi +varsinkin +toisensa +jossa +kaikilta +mahdollisimman +tavalla +näin +liki +joka +olin +entisen +aist +oma +yleensä +nopeiten +tulisi +viisi +vähintään +hyvistä +aikaisin +kahdelle +miksi +olit +joita +halua +asioiden +miten +aiomme +heidän +kaikkin +ylemmäs +näitä +aloittaa +kahdeksannen +ketä +vielä +aikaisemmin +puolestaan +aloitit +jolla +myötä +omaksi +lähes +kiitos +hyvin +samalla +sijaan +milloin +sisäkkäin +se +olen +noin +alemmas +kaikkialla +muuten +ellemme +pienestä +omiin +olet +aikaan +tuskin +yhtäältä +siellä +keittä +kuinka +vastakkain +halusitte +ilman +sata +siksi +tuo +tulet +suuria +muut +satojen +eräs +kaikkien +tulivat +varmasti +ohi +muualta +kyllä +haluan +kenessä +ensimmäinen +mikin +tulisin +oman 
+joiksi +edelle +entisiä +entisten +kauan +alas +ainoat +ellette +suoraan +eikä +melko +alusta +ettei +alkuun +aloitettava +monet +ketkä +uusien +tulit +kuitenkin +aloittamatta +vaan +toisemme +keihin +viimeksi +toisaalle +muuanne +ainakaan +tänä +oikein +mennessä +samoin +kerran +kanssaan +sadam +täällä +kanssamme +vasta +vaikeissa +älä diff --git a/src/mongo/db/fts/stop_words_french.txt b/src/mongo/db/fts/stop_words_french.txt new file mode 100644 index 00000000000..42502590cbc --- /dev/null +++ b/src/mongo/db/fts/stop_words_french.txt @@ -0,0 +1,126 @@ +vu +avoir +sa +étions +faites +ni +quand +ou +tu +tout +et +hors +haut +avec +votre +ton +tes +dedans +comment +il +aussi +vont +fait +soyez +peu +ils +sans +avant +valeur +été +pas +font +mine +encore +sien +le +mais +ci +quels +devrait +mot +droite +ma +pièce +tels +force +dans +qui +trop +ça +juste +au +dehors +eu +cela +voient +leur +notre +plupart +elles +dos +tous +elle +bon +étaient +nommés +je +ces +nouveaux +donc +est +pourquoi +car +son +alors +peut +quel +quelle +pour +essai +ici +sujet +mon +état +depuis +même +que +où +parole +être +voie +autre +sont +sur +des +début +ta +la +deux +ce +doit +quelles +du +ses +là +tandis +personnes +en +par +chaque +parce +fois +si +les +nous +mes +vous +aucuns +seulement +moins +maintenant +ceux +tellement +très +comme +sous diff --git a/src/mongo/db/fts/stop_words_german.txt b/src/mongo/db/fts/stop_words_german.txt new file mode 100644 index 00000000000..4fcf107963b --- /dev/null +++ b/src/mongo/db/fts/stop_words_german.txt @@ -0,0 +1,992 @@ +kleiner +muss +zwanzig +berichtete +dort +nun +könnten +fortsetzten +seinen +important +gefiel +sofort +liegt +wolle +eben +in +dadurch +sind +geben +etc +wenngleich +erhalten +ihr +solchen +bezüglich +bald +befragte +werde +geehrten +singt +fragte +jedem +eines +behalten +wegen +freies +einseitig +dasselbe +befragen +unmögliche +weg +nachhinein +senkte +ihrem +drunter +später +wäre +daraus +ausdrückte +sagt +einmal +inzwischen 
+startet +ganzem +lesen +vermag +forderten +fast +wieviel +alsbald +damals +gewollt +mithin +behielt +erhielten +irgendwen +geteilt +bearbeitete +eröffnen +bist +daran +will +diese +danke +einst +oft +legte +gleichwohl +demnach +begonnen +reagieren +derartig +links +dieser +soweit +worin +möglichen +folgender +desto +zahlreich +wogegen +sich +jenen +schnell +meta +wen +rechts +abgerufener +muesste +vorbei +nacher +nebenan +schwierig +senkt +für +soviel +deswegen +blieb +veröffentlichtes +dagegen +veröffentlichen +einem +stets +hast +demselben +wollen +nötigenfalls +sondern +zugleich +umso +ins +wolltet +deine +danach +konkreter +diesen +abgerufene +mancherorts +befragten +teilten +tat +übermorgen +unse +manche +weshalb +senken +solltest +woher +siebte +sofern +so +guten +vollständig +meiste +doppelt +veröffentlicht +derselbe +zuviel +dem +hinterher +veröffentlicher +ziehen +mich +anderem +überallhin +können +lagen +weiteres +jedenfalls +muessen +eigenen +beitragen +gmbh +keines +pfui +befragter +falls +ergänzten +ergänze +vielleicht +hundert +koennen +entweder +muesst +durchaus +sollt +jemand +solch +wirklich +fortsetzt +sage +derjenige +angesetzt +gleichzeitig +derselben +diesseits +anderm +gern +keinen +gehen +könnt +daher +allerdings +wohingegen +steigen +unserm +unserem +nämlich +heraus +wodurch +obgleich +bloss +seit +aufhören +verrate +je +wohlweislich +einführten +tatsächlichen +ganzes +kommen +waren +alles +komme +einige +dinge +erst +tragen +machst +direkten +einiger +anerkannter +wären +ganz +woraufhin +darauf +jedoch +daneben +einführen +längstens +koennten +beide +such +dorthin +möglich +ausser +unterhalb +vor +darfst +sicherlich +geworden +stattdessen +beinahe +gängig +gibt +sieht +schätzte +euren +obwohl +vom +könnte +vergangenes +wolltest +welchen +wiewohl +schreiber +deshalb +mochte +unses +aber +tät +manches +eröffnet +anders +gängiges +seines +seitdem +titel +musste +jenes +unbedingt +trotzdem +sowohl +kaum +davon +möchten +gemacht 
+veröffentlichten +verraten +sangen +glücklicherweise +somit +gefallen +eurer +machte +womit +übrig +ihren +sollten +weder +heute +regelmäßig +meiner +was +legten +vorher +niemals +dein +derart +schließlich +außen +noch +zwar +vieles +wenig +fünf +anderes +mussten +schätzen +sagten +angesetze +eigentlich +wann +deines +letztes +fordert +einig +meinem +hiesige +möchte +bin +möglicher +und +z.B. +unten +angesetzten +sehr +dessen +ihrer +beim +weiterhin +weiteren +tun +solcher +hattet +würden +viel +gängige +an +vermutlich +müssten +kommt +lediglich +zehn +mehr +wohin +nie +gedurft +einerseits +besonders +teilen +hätte +darüberhinaus +warst +es +allerlei +ander +unserer +versorgtes +benutzt +hätt +ausdrücken +hab +usw +keine +veröffentlichte +stieg +neues +ab +leicht +allen +könn +fortsetzen +keineswegs +herein +siehe +waere +derem +möchtest +überdies +allgemein +einseitiger +überall +möglicherweise +solche +obschon +beitrugen +gratulieren +fordern +denen +versorgt +mehrere +haette +ueber +meisten +wenige +gesagt +ausgenommen +drauf +einfach +verrieten +hätten +leer +euer +unmöglich +eingesetzt +gefälligst +zur +dannen +gänzlich +anerkannt +zumal +derjenigen +alle +anfing +nichts +mir +wird +jedes +hallo +dafür +spielen +ausserdem +einstmals +ca. 
+einseitigen +außerhalb +manchen +ggf +andern +unsen +wär +selbst +eröffne +ob +findest +höchstens +dich +suchen +mögen +direkter +sobald +folglich +einigen +dunklen +nicht +ganzen +bearbeiten +bereits +müssen +jährigen +steigt +meist +sicher +seid +geblieben +info +ein +währenddessen +derzeit +finden +ersten +bei +dabei +nützt +bestehen +oberhalb +wer +seht +nachdem +wo +wollte +gratuliert +wenigstens +schreibens +author +bearbeite +koennte +dritte +lichten +sehen +dreißig +vorne +eurem +geehrte +uns +hinein +ihnen +daß +konkrete +irgendwie +sollen +natürlich +also +hin +doch +welches +geb +gar +deinem +ferner +eher +eine +hiermit +neben +gefällt +tatsächlicher +beides +eröffnetes +dennoch +etliche +wurde +brachten +zwölf +setzen +zudem +berichten +müsste +reagiert +sollst +hattest +jederlei +tut +denselben +euch +auch +konkretes +gemocht +stiegen +kannst +mache +lag +kein +unsere +ungefähr +txt +klaren +außer +weiß +braucht +tust +dürfte +nutzen +dazu +ergänzte +warum +konnte +sog +einführte +morgen +ihm +genug +durfte +hatte +eröffnete +igitt +neuen +musst +der +wenn +fortsetzte +wessen +insofern +rund +vorgestern +irgendwas +erhielt +gewesen +anderen +geht +angesetzter +frei +teile +kleines +keiner +nächste +gbr +die +etwa +im +wir +zusammen +bräuchte +weiterem +bleiben +starteten +davor +liest +durch +niemand +zum +jeden +einer +seiten +freier +dank +oben +hat +eigenes +soll +beträchtlich +um +sodaß +gib +erste +völlig +oder +geehrter +setzten +fall +aller +aufzusuchen +seither +versorgten +böden +sonst +nein +machen +zog +danken +einiges +anerkanntes +müßt +setzt +eins +bedürfen +gegeben +konkret +bietet +voran +unter +gekonnt +geteilte +jähriges +diejenige +sagen +bsp. 
+folgende +du +allmählich +dieses +sogar +ende +folgendes +abgerufen +neue +vorüber +reagiere +nirgendwo +welche +abgerufenes +vergangener +sein +sowie +welchem +übel +sie +plötzlich +müßte +mancher +dahin +auf +gängiger +seiner +bearbeiteten +jährige +machten +solc hen +autor +beiderlei +aufgehört +mal +haben +jener +sei +per +sagtest +legen +unterbrach +bekannt +mindestens +acht +jenseits +bedarf +sooft +habe +da +unsem +zuerst +versorge +hinten +eure +letztlich +beiden +sechs +mögliche +werdet +eures +wollt +bringen +immer +unterbrechen +besser +dies +erneut +dir +war +einigem +drei +findet +letztens +gängigen +derer +klares +mußt +schätzten +margin +besteht +seine +langsam +einbaün +sang +drin +nach +weiterer +zwischen +ganze +vielmals +gesehen +getragen +klein +indem +frau +teilte +hinter +genommen +bekennen +darum +seinem +ehe +indessen +begann +hoch +solange +kleinen +das +berichteten +sect +ohne +leider +solchem +nur +durften +vergangen +bloß +direkte +unseren +bekannte +berichtet +andernfalls +verriet +steige +ergänzen +zurück +dieselben +jede +morgige +jene +kam +trug +nimm +zieht +letzten +wieder +zogen +konnten +willst +allzu +angefangen +nehmen +ganzer +geehrt +mann +trage +bis +schlechter +sehe +mag +unmöglichen +werden +unnötig +dieselbe +bedurfte +gemäss +bevor +sämtliche +anerkannte +gab +konkreten +unwichtig +wieso +senke +keinerlei +senkten +allem +anfangen +erster +denn +ich +total +gleich +schreibe +versorgen +gestrige +zu +bekannter +aufhörte +irgendwer +gekommen +ist +darf +tausend +manchmal +zufolge +andere +vier +hatten +neuem +fuer +gewissermaßen +unmöglicher +jenem +wem +damit +sonstwo +nirgends +sagte +kann +unser +klar +macht +folgenden +wichtig +woraus +nahm +darüber +jeder +arbeiten +einen +wurden +vielen +tatsächlich +trägt +man +soeben +wohl +ausdrückt +ebenso +muß +wachen +lassen +letze +zuletzt +manchem +immerhin +diesem +startete +weitere +nutzt +statt +etwas +aus +reagierte +beginnen +nimmt +infolge +brauchen +hier +dann +nacht 
+forderte +bislang +desselben +über +meines +weiter +ja +rief +aufgrund +wollten +den +als +vieler +neun +gehabt +anderer +gebracht +irgendwo +sollte +andererseits +dürfen +heutige +deiner +selber +meinen +entsprechend +dass +wirst +gratulierte +bisher +vermögen +brachte +setzte +ihres +weniger +laut +magst +versorgte +klare +während +darin +schätzt +gegen +einigermaßen +solches +keinem +außerdem +am +freie +gestern +anstatt +fand +wie +vergangene +welcher +deinen +schreiben +jetzt +ansetzen +halb +allein +einseitige +tatsächliches +längst +ebenfalls +übrigens +er +mit +unseres +drüber +sieben +darunter +jährig +gute +seite +innen +nimmer +neuer +singen +erhält +irgend +künftig +deren +zwei +einzig +bzw +getan +version +steht +zeitweise +irgendeine +direkt +schon +innerhalb +mein +ihn +viele +dorther +meine +des +würde +nutzung +ihre +pro +letztendlich +von +neu +ähnlich +tages +sehrwohl +weil +koennt diff --git a/src/mongo/db/fts/stop_words_hungarian.txt b/src/mongo/db/fts/stop_words_hungarian.txt new file mode 100644 index 00000000000..abdfe8b7498 --- /dev/null +++ b/src/mongo/db/fts/stop_words_hungarian.txt @@ -0,0 +1,35 @@ +nem +be +mi +igen +a +fel +van +ők +csak +hát +én +meg +lesz +szét +az +és +ide +ön +le +ki +össze +ő +hogy +mint +te +oda +vagy +ti +vissza +egy +át +de +el +rá +volt diff --git a/src/mongo/db/fts/stop_words_italian.txt b/src/mongo/db/fts/stop_words_italian.txt new file mode 100644 index 00000000000..50ebb1c32c3 --- /dev/null +++ b/src/mongo/db/fts/stop_words_italian.txt @@ -0,0 +1,279 @@ +tuoi +farò +all +stai +avevamo +avevo +nelle +fanno +stesse +dagli +avesti +stiamo +abbiate +ai +facevi +avevi +dall +sull +sarebbero +dalle +abbiamo +avessi +stettero +nella +avrete +del +stessi +stavate +farai +dalla +tutti +starò +facemmo +avesse +tu +stiano +faceste +dallo +tutto +avevate +farete +faceva +nello +e +sulle +loro +sono +se +degli +fu +sarai +il +avrei +avuto +stessimo +a +sarete +avrebbero +stavano +sulla +in +si +farà +c +facessi
+avevano +sul +li +fecero +sia +starebbe +eravamo +lo +fui +contro +furono +suoi +steste +avuti +o +starei +faremmo +staranno +sullo +avrò +fece +i +della +agli +stesti +avute +la +non +saranno +gli +facessimo +starà +dal +avendo +coi +delle +faccio +nei +fareste +stando +negl +su +anche +avuta +da +facesse +le +facciamo +è +siate +ebbero +faranno +facevate +foste +dello +avrebbe +facessero +facevano +feci +al +stemmo +fai +cui +fummo +quale +avrà +perché +farebbe +mi +negli +faccia +di +siete +facciano +sta +stavo +sarebbe +faremo +degl +stavamo +dov +dei +uno +saremo +starete +faresti +stavi +staresti +essendo +facevamo +nostro +ad +tra +avresti +alle +starai +nell +fossero +abbiano +fosti +nostri +per +ma +stia +saresti +una +sue +dove +siamo +eravate +starebbero +facendo +lei +avreste +agl +alla +sto +stava +nostre +quelli +stette +hanno +sua +dagl +quello +voi +staremmo +vi +nostra +stareste +sarò +avessimo +allo +siano +io +come +suo +facciate +saremmo +ci +quella +era +l +avemmo +miei +sareste +ed +sui +quanto +avete +un +con +ero +vostre +questo +nel +quanti +più +ha +stiate +quelle +stetti +col +avremmo +fosse +questi +noi +tua +ho +mia +farebbero +erano +vostra +ebbi +farei +quante +aveste +abbia +stessero +staremo +avessero +eri +avranno +fossi +queste +hai +sarei +avrai +tue +chi +sei +stanno +mie +dell +ebbe +ne +quanta +dai +avremo +aveva +vostro +questa +che +fossimo +sarà +facevo +vostri +lui +ti +sugl +tuo +facesti +sugli +mio diff --git a/src/mongo/db/fts/stop_words_norwegian.txt b/src/mongo/db/fts/stop_words_norwegian.txt new file mode 100644 index 00000000000..daf5f27d9c8 --- /dev/null +++ b/src/mongo/db/fts/stop_words_norwegian.txt @@ -0,0 +1,119 @@ +som +alle +et +vÖre +gjÛre +slik +ha +nÅ +fordi +og +skulle +sist +hvis +vil +hvem +andre +slutt +mens +siden +sÅ +over +lage +da +din +vite +deres +disse +for +hva +Å +hennes +kunne +ny +ved +fra +lang +mer +kan +er +denne +verdi +riktig +bruke +meget +opp +mÅ +mye +sant +samme +mÅte +hvordan +der 
+ville +uten +eneste +hans +oss +hver +like +tilstand +arbeid +hvilken +fÅ +hvor +folk +det +ut +start +gÅ +hvorfor +god +tid +ikke +meg +han +stille +bra +fÛrst +i +ene +fÛr +ogsÅ +enn +rett +navn +lik +makt +med +av +til +inn +vÅr +pÅ +her +nÅr +mange +du +forsÛke +begge +vi +part +eller +hadde +tilbake +en +var +enhver +si +vÖrt +mest +om +gjorde +men +min +punkt +bort +under +nei +innen diff --git a/src/mongo/db/fts/stop_words_portuguese.txt b/src/mongo/db/fts/stop_words_portuguese.txt new file mode 100644 index 00000000000..5b14cf26243 --- /dev/null +++ b/src/mongo/db/fts/stop_words_portuguese.txt @@ -0,0 +1,147 @@ +para +o +está +como +quem +ou +eles +nosso +este +não +tu +muitos +somente +corrente +parte +quando +todos +por +das +estão +têm +poderá +com +qualquer +fora +nome +estar +pode +também +novo +bem +meu +dois +desde +ver +fez +estado +ser +ir +aquelas +trabalhar +promeiro +iste +usar +teu +é +muito +mais +tentar +quieto +aquele +teve +quê +comprido +ela +irá +tentaram +deverá +você +estará +são +fará +desligado +tenho +e +podia +então +direita +isto +esteve +tive +tal +eu +debaixo +dentro +foi +último +caminho +tentei +diz +valor +fazer +pessoas +fazia +tipo +deve +acerca +pelo +trabalho +dos +em +veja +aqueles +uns +devem +alguns +verdade +nós +uma +tente +bom +mas +conhecido +algmas +saber +umas +usa +qual +um +porque +seu +estivemos +inicio +horas +ambos +cima +outro +maioria +faz +estive +aquela +os +pegar +estiveram +ista +sem +mesmo +povo +apontar +fim +estes +ligado +atrás +dizer +maiorias +iniciar +aqui +ele +tempo +enquanto +ali +tem +antes +verdadeiro +onde +agora +cada diff --git a/src/mongo/db/fts/stop_words_romanian.txt b/src/mongo/db/fts/stop_words_romanian.txt new file mode 100644 index 00000000000..1a7cb994c86 --- /dev/null +++ b/src/mongo/db/fts/stop_words_romanian.txt @@ -0,0 +1,258 @@ +vreo +acelea +cita +degraba +lor +alta +tot +ai +dat +x +despre +peste +bine +dar +foarte +z +avea +multi +cit +alt +mai +sa +fie +tu +multe +e 
+orice +dintr +se +g +intr +niste +multa +insa +il +fost +a +abia +nimic +sub +acel +in +altceva +si +avem +altfel +c +ea +acest +li +parca +fi +dintre +unele +m +acestei +mare +cel +este +pe +atitia +uneori +acela +iti +astazi +acestui +o +imi +ele +ceilalti +pai +fata +noua +sa-ti +altul +au +i +prin +conform +aceste +anume +azi +k +unul +ala +unei +fara +ei +la +aceeasi +u +inapoi +acestea +acesta +catre +sale +asupra +as +aceea +ba +ale +da +le +apoi +aia +suntem +cum +isi +inainte +s +de +cind +cumva +chiar +acestia +daca +sunt +care +al +numai +cui +sus +tocmai +prea +cu +mi +eu +doar +niciodata +exact +putini +aiurea +tuturor +celor +astfel +atunci +citeva +cat +sau +fel +intre +acolo +nostri +ma +mult +una +ceea +iar +sintem +ati +din +geaba +sai +caruia +adica +inca +are +aici +ca +ia +nici +d +oricum +asta +carora +face +citiva +voi +unor +f +atat +toata +alaturi +cea +nu +totusi +ce +altii +acum +sint +capat +mod +deasupra +cam +vom +b +toate +careia +aceasta +atit +nimeni +ii +ci +unde +ul +plus +era +sa-mi +l +spre +dupa +nou +cele +acea +un +incit +n +cei +or +va +deci +acelasi +atatea +h +vor +decit +noi +cineva +desi +ceva +j +ului +atitea +avut +ar +pina +t +atata +unui +el +citi +asa +totul +pentru +atita +v +alti +asemenea +atatia +te +ne +deja +unii +p +atare +cite +cine +cand +toti +vreun +ori +r +alte +lui +ti +ni +aceia +am diff --git a/src/mongo/db/fts/stop_words_russian.txt b/src/mongo/db/fts/stop_words_russian.txt new file mode 100644 index 00000000000..b44c0fc7011 --- /dev/null +++ b/src/mongo/db/fts/stop_words_russian.txt @@ -0,0 +1,421 @@ +четвертый +многочисленное +там +ты +обычно +даром +через +из +туда +каждый +начала +алло +он +за +мор +вам +долго +только +пока +быть +этим +ими +важные +раз +да +теми +никогда +е +менее +под +раньше +них +прекрасно +сама +время +семнадцать +несколько +люди +чаще +им +действительно +том +десятый +везде +тою +четырнадцать +вся +тринадцать +какой +такая +внизу +разве +нее +моя +наш +зато +каждая +ними 
+моё +почти +другие +отовсюду +к +которых +должно +затем +которые +более +важное +давно +рано +всему +может +второй +пятый +тысяч +кто +тому +тоже +ваши +нею +меля +нужно +ни +нх +был +твоё +часто +ею +позже +твоя +вас +двадцатый +нибудь +именно +друго +самими +своих +себе +семь +процентов +одиннадцать +этом +нему +сказать +вдали +всеми +другой +тобою +хорошо +сказала +теперь +были +вверх +двух +неё +говорит +сегодня +тех +потому +этого +ну +пятнадцать +будете +хоть +сих +занято +года +бы +конечно +восемнадцатый +которой +девять +пожалуйста +после +этими +мало +впрочем +без +двенадцать +было +совсем +этот +так +она +непрерывно +свою +нет +хотеть +себя +самого +оба +многочисленная +назад +бывь +кого +где +будут +буду +ней +можно +всем +ещё +сам +вами +мне +кругом +мог +шестнадцатый +о +эта +такие +г +такой +еще +который +мной +самой +ком +то +те +восемнадцать +весь +занят +оно +сказал +сначала +с +которая +нередко +никуда +много +со +ниже +хотя +которого +других +против +однажды +восемь +будь +наша +ли +нас +особенно +тебе +четырнадцатый +две +иметь +первый +вы +иногда +кем +человек +самому +и +во +будет +это +три +заняты +этих +пятнадцатый +довольно +чтоб +все +над +их +мы +ту +будто +от +всего +что +но +при +день +всё +собою +ваш +очень +посреди +говорил +наше +год +опять +третий +четыре +чего +шесть +кажется +немного +семнадцатый +мой +недавно +ему +дел +в +далеко +также +такое +того +будем +однако +времени +какая +ваше +бывает +сами +я +потом +т +не +уж +надо +занята +низко +когда +вдруг +пять +году +вон +девятнадцать +седьмой +девятый +тебя +здесь +этой +шестнадцать +миллионов +стал +самих +снова +чуть +самим +одной +или +ей +саму +мира +каждое +больше +ваша +была +чтобы +просто +двадцать +само +суть +на +слишком +эту +твой +жизнь +тринадцатый +мимо +тут +восьмой +многочисленный +тем +её +него +вокруг +об +кроме +недалеко +лишь +хочешь +один +наиболее +пора +мои +как +м +чем +меня +нами +есть +близко +почему +уже +тобой +мною +его +они +девятнадцатый +у +перед 
+всех +пор +всею +вот +чему +значит +каждые +уметь +зачем +нем +между +всюду +своей +двенадцатый +важный +около +мож +для +можхо +другая +до +ж +эти +самом +даже +кому +всегда +сейчас +ее +одиннадцатый +наверху +куда +одного +своего +меньше +важная +свои +по +могут +про +тот +тогда +свое +лучше +сколько +будешь +наконец +вниз +всю +спасибо +многочисленные +ведь +наши +два +шестой +имя +же +сеаой +отсюда +рядом +а +лет +собой +другое +дальше +ничего +мочь +если +та +десять +этому +нельзя +нам diff --git a/src/mongo/db/fts/stop_words_spanish.txt b/src/mongo/db/fts/stop_words_spanish.txt new file mode 100644 index 00000000000..04132dc6b66 --- /dev/null +++ b/src/mongo/db/fts/stop_words_spanish.txt @@ -0,0 +1,177 @@ +para +tuyo +usan +primero desde +sabes +como +aquellos +largo +ante +podriais +sin +incluso +un +intento +eras +cierto +una +otro +consigues +ha +bien +tras +alguna +hacemos +podrian +tiempo +por +pero +verdadera +podrias +somos +fue +muchos +podeis +modo +intentas +el +trabajar +bajo +fin +atras +ultimo +puedo +hace +ellas +aquel +intenta +estado +ir +ser +haces +las +tener +entre +vais +cierta +van +usar +intentan +su +nos +trabajamos +verdad +estan +trabajan +estoy +ellos +empleas +algunas +también +siendo +muy +solamente +pueden +ciertas +yo +tengo +estamos +unos +hacen +los +empleo +dentro +sus +valor +sois +vosotros +eramos +sabe +tiene +fui +voy +consiguen +podemos +esta +trabajais +entonces +dos +verdadero +saben +trabajas +era +vaya +estaba +usamos +poder +podriamos +vosotras +mio +encima +consigo +usas +teneis +algún +solo +podria +lo +tienen +conseguimos +trabajo +saber +usa +soy +eran +ampleamos +porque +emplear +es +donde +fueron +ambos +tenemos +sabeis +fuimos +eres +va +empleais +cuando +haceis +con +mientras +intentar +aquellas +conseguir +puede +emplean +la +algunos +sobre +vamos +estais +quien +hacer +ciertos +aqui +usais +todo +en +intentais +bastante +uno +sabemos +unas +si +consigue +trabaja +alguno +uso +antes +intentamos +hago +cual 
+arriba +por qué +gueno +nosotros +cada diff --git a/src/mongo/db/fts/stop_words_swedish.txt b/src/mongo/db/fts/stop_words_swedish.txt new file mode 100644 index 00000000000..fb5ecc1d275 --- /dev/null +++ b/src/mongo/db/fts/stop_words_swedish.txt @@ -0,0 +1,386 @@ +bort +ert +gör +likställda +skall +nittonde +finns +artonn +flera +kanske +tills +gick +gjorde +tjugoett +fjorton +siste +hundra +senare +varit +sjuttonde +hundraen +sina +honom +den +nederst +viktigare +allas +del +vilket +vad +större +möjligen +åttio +tolfte +kommer +heller +inför +viktigast +sista +gjort +och +tidigt +mitt +tio +gäller +fler +komma +fem +tretton +smått +mittemot +jämfört +eller +åtta +vart +mycket +aderton +enligt +minst +sjunde +kommit +möjligt +ingenting +liten +goda +längst +när +nummer +femtionde +mest +adertonde +över +alltid +från +fyra +det +göra +enkla +nr +förlåt +varsågod +under +hon +övermorgon +sextionde +något +flesta +fyrtio +gått +nittio +gällt +ute +annat +bland +långsammare +nästa +in +hennes +kunna +ursäkt +fram +dagar +du +innan +varken +elva +två +vill +lätt +mera +inuti +dig +tillsammans +kunnat +femte +som +även +någon +kr +vänstra +åtminstone +allt +utanför +nedersta +annan +noll +nittionde +får +i +gå +tidig +säga +stora +sig +behövt +åttionde +genom +helt +varför +behövde +slutligen +alltså +fjortonde +sitt +tionde +där +delen +imorgon +kvar +tjugotvå +jag +verkligen +andra +ettusen +hellre +mina +inom +vår +tidigare +sextonde +kom +han +värre +om +högre +sämst +vilka +bättre +mindre +första +vårt +ur +tredje +också +finnas +gälla +möjlig +legat +behövas +fjärde +ut +litet +de +har +bakom +deras +små +övre +detta +till +hit +ofta +tjungo +för +sagt +beslutat +att +min +så +blivit +nio +mellan +vems +olika +nionde +upp +mot +ditt +nödvändigt +dina +länge +hur +haft +femtio +nitton +efter +framför +idag +inne +beslutit +dagarna +tjugoen +möjligtvis +rätt +därför +på +igår +gärna +samma +stort +dem +sjuttionde +igen +tjugonde +senast +säger +båda +då +ännu +inga 
+vidare +tidigast +längre +hög +godare +störst +bli +sex +inget +trettionde +din +sexton +hans +vem +våra +tre +går +få +åttonde +nedre +dagen +sedan +alla +sist +man +långsammast +bra +ligga +femton +många +henne +femtonde +helst +tolv +förra +lika +aldrig +nu +vid +sjuttio +nödvändigtvis +långsam +viktigt +vi +ibland +ingen +enkelt +överst +enkel +viktig +några +med +nödvändiga +kunde +höger +tack +dag +ska +fin +fanns +eftersom +oss +adjö +era +andras +ja +snart +rakt +bäst +lättast +ner +vilken +sextio +följande +bådas +lilla +hundraett +måste +utan +trettio +lättare +långsamt +men +än +ha +god +kan +av +skulle +vänster +sjutton +sämre +olikt +varifrån +var +stor +sju +långt +gott +före +tjugo +någonting +mig +fyrtionde +elfte +inte +genast +likställd +nej +hade +ligger +bara +borta +trettonde +sade +beslut +dock +lite +vara +redan +ned +en +fick +tjugotre +dess +artonde +knappast +fått +här +högst +tvåhundra +ett +blir +sent +sin +sjätte +nödvändig +mer +er +ni +blev +nog +godast +dit +oftast +behöva diff --git a/src/mongo/db/fts/stop_words_test.cpp b/src/mongo/db/fts/stop_words_test.cpp new file mode 100644 index 00000000000..8d70600ce8e --- /dev/null +++ b/src/mongo/db/fts/stop_words_test.cpp @@ -0,0 +1,32 @@ +// stop_words_test.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +#include "mongo/db/fts/stop_words.h" +#include "mongo/unittest/unittest.h" + +namespace mongo { + namespace fts { + + TEST( English, Basic1 ) { + const StopWords* english = StopWords::getStopWords( "english" ); + ASSERT( english->isStopWord( "the" ) ); + ASSERT( !english->isStopWord( "computer" ) ); + } + + } +} diff --git a/src/mongo/db/fts/stop_words_turkish.txt b/src/mongo/db/fts/stop_words_turkish.txt new file mode 100644 index 00000000000..66dea7e2dec --- /dev/null +++ b/src/mongo/db/fts/stop_words_turkish.txt @@ -0,0 +1,114 @@ +mu +onlar +seksen +ama +trilyon +buna +bizim +þeyden +yirmi +altý +iki +seni +doksan +dört +bunun +ki +nereye +altmýþ +hem +milyon +kez +otuz +beþ +elli +bizi +da +sekiz +ve +çok +bu +veya +ya +kýrk +onlarýn +ona +bana +yetmiþ +milyar +þunu +senden +birþeyi +dokuz +yani +kimi +þeyler +kim +neden +senin +yedi +niye +üç +þey +mý +tüm +onlari +bunda +ise +þundan +hep +þuna +bin +ben +ondan +kimden +bazý +belki +ne +bundan +gibi +de +onlardan +sizi +sizin +daha +niçin +þunda +INSERmi +bunu +beni +ile +þu +þeyi +sizden +defa +biz +için +dahi +siz +nerde +kime +birþey +birkez +her +biri +on +mü +diye +acaba +sen +en +hepsi +bir +bizden +sanki +benim +nerede +onu +benden +yüz +birkaç +çünkü +nasýl +hiç +katrilyon diff --git a/src/mongo/db/fts/tokenizer.cpp b/src/mongo/db/fts/tokenizer.cpp new file mode 100644 index 00000000000..73f485901f6 --- /dev/null +++ b/src/mongo/db/fts/tokenizer.cpp @@ -0,0 +1,129 @@ +// tokenizer.cpp + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. 
+* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#include <string> + +#include "mongo/db/fts/tokenizer.h" +#include "mongo/util/stringutils.h" + +namespace mongo { + + namespace fts { + + Tokenizer::Tokenizer( const string& language, const StringData& str ) + : _pos(0), _raw( str ) { + _english = language == "english"; + _skipWhitespace(); + _previousWhiteSpace = true; + } + + bool Tokenizer::more() const { + return _pos < _raw.size(); + } + + Token Tokenizer::next() { + if ( _pos >= _raw.size() ) + return Token( Token::INVALID, "", 0, false ); + + unsigned start = _pos++; + Token::Type type = _type( _raw[start] ); + if ( type == Token::WHITESPACE ) abort(); + + if ( type == Token::TEXT ) + while ( _pos < _raw.size() && _type( _raw[_pos] ) == type ) + _pos++; + + StringData ret = _raw.substr( start, _pos - start ); + bool old = _previousWhiteSpace; + _previousWhiteSpace = _skipWhitespace(); + return Token( type, ret, start, old ); + } + + + bool Tokenizer::_skipWhitespace() { + unsigned start = _pos; + while ( _pos < _raw.size() && _type( _raw[_pos] ) == Token::WHITESPACE ) + _pos++; + return _pos > start; + } + + + Token::Type Tokenizer::_type( char c ) const { + switch ( c ) { + case ' ': + case '\f': + case '\v': + case '\t': + case '\r': + case '\n': + return Token::WHITESPACE; + case '\'': + if ( _english ) + return Token::TEXT; + else + return Token::WHITESPACE; + + case '~': + case '`': + + case '!': + case '@': + case '#': + case '$': + case '%': + case '^': + case '&': + case '*': + case '(': + case ')': + + case '-': + + case '=': + case '+': + + case '[': + case ']': + case '{': + case '}': + case '|': + case '\\': + + case ';': + case ':': + + case '"': + + case '<': + case '>': + + case ',': + case '.': + + case '/': + case '?': + + return Token::DELIMITER; + default: + return Token::TEXT; + } + } + + } + +} diff --git 
a/src/mongo/db/fts/tokenizer.h b/src/mongo/db/fts/tokenizer.h new file mode 100644 index 00000000000..5b9a56ed8d6 --- /dev/null +++ b/src/mongo/db/fts/tokenizer.h @@ -0,0 +1,68 @@ +// tokenizer.h + +/** +* Copyright (C) 2012 10gen Inc. +* +* This program is free software: you can redistribute it and/or modify +* it under the terms of the GNU Affero General Public License, version 3, +* as published by the Free Software Foundation. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU Affero General Public License for more details. +* +* You should have received a copy of the GNU Affero General Public License +* along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + + +#pragma once + +#include <string> + +#include "mongo/base/string_data.h" +#include "mongo/platform/unordered_map.h" +#include "mongo/platform/unordered_set.h" + +namespace mongo { + + namespace fts { + + struct Token { + enum Type { WHITESPACE, DELIMITER, TEXT, INVALID }; + Token( Type type, const StringData& data, unsigned offset, bool previousWhiteSpace ) + : type( type ), + data( data ), + offset( offset ), + previousWhiteSpace( previousWhiteSpace ) {} + + bool ok() const { return type != INVALID; } + + Type type; + StringData data; + unsigned offset; + bool previousWhiteSpace; + }; + + class Tokenizer { + public: + + Tokenizer( const std::string& language, const StringData& str ); + + bool more() const; + Token next(); + + private: + Token::Type _type( char c ) const; + bool _skipWhitespace(); + + unsigned _pos; + bool _previousWhiteSpace; + const StringData& _raw; + bool _english; + }; + + } +} + diff --git a/src/mongo/db/fts/tokenizer_test.cpp b/src/mongo/db/fts/tokenizer_test.cpp new file mode 100644 index 00000000000..1502b2f4390 --- /dev/null +++ b/src/mongo/db/fts/tokenizer_test.cpp @@ -0,0 +1,119 @@ +// 
tokenizer_test.cpp
+
+/**
+* Copyright (C) 2012 10gen Inc.
+*
+* This program is free software: you can redistribute it and/or modify
+* it under the terms of the GNU Affero General Public License, version 3,
+* as published by the Free Software Foundation.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU Affero General Public License for more details.
+*
+* You should have received a copy of the GNU Affero General Public License
+* along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "mongo/db/fts/tokenizer.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo {
+    namespace fts {
+
+        // Every test keeps its input in a named StringData that outlives the
+        // Tokenizer. The constructor receives the text by const reference, so
+        // passing a string literal directly would hand it a temporary that is
+        // gone by the time next() runs; a named object keeps the tests
+        // independent of how the Tokenizer stores that argument.
+
+        // Empty input yields no tokens at all.
+        TEST( Tokenizer, Empty1 ) {
+            const StringData text( "" );
+            Tokenizer i( "english", text );
+            ASSERT( !i.more() );
+            ASSERT( !i.next().ok() );
+        }
+
+        // Whitespace separates words and is never returned as a token.
+        TEST( Tokenizer, Basic1 ) {
+            const StringData text( "blue red green" );
+            Tokenizer i( "english", text );
+
+            ASSERT( i.more() );
+            ASSERT_EQUALS( i.next().data.toString(), "blue" );
+
+            ASSERT( i.more() );
+            ASSERT_EQUALS( i.next().data.toString(), "red" );
+
+            ASSERT( i.more() );
+            ASSERT_EQUALS( i.next().data.toString(), "green" );
+
+            ASSERT( !i.more() );
+        }
+
+        // A delimiter between two words is its own token; nothing here is
+        // preceded by whitespace except the first token.
+        TEST( Tokenizer, Basic2 ) {
+            const StringData text( "blue-red" );
+            Tokenizer i( "english", text );
+
+            Token a = i.next();
+            Token b = i.next();
+            Token c = i.next();
+            Token d = i.next();
+
+            ASSERT_EQUALS( Token::TEXT, a.type );
+            ASSERT_EQUALS( Token::DELIMITER, b.type );
+            ASSERT_EQUALS( Token::TEXT, c.type );
+            ASSERT_EQUALS( Token::INVALID, d.type );
+
+            ASSERT_EQUALS( "blue", a.data.toString() );
+            ASSERT_EQUALS( "-", b.data.toString() );
+            ASSERT_EQUALS( "red", c.data.toString() );
+
+            ASSERT( a.previousWhiteSpace );
+            ASSERT( !b.previousWhiteSpace );
+            ASSERT( !c.previousWhiteSpace );
+        }
+
+        // Same words with a space before the delimiter: the delimiter is now
+        // flagged as preceded by whitespace, and offsets index the raw input.
+        TEST( Tokenizer, Basic3 ) {
+            const StringData text( "blue -red" );
+            Tokenizer i( "english", text );
+
+            Token a = i.next();
+            Token b = i.next();
+            Token c = i.next();
+            Token d = i.next();
+
+            ASSERT_EQUALS( Token::TEXT, a.type );
+            ASSERT_EQUALS( Token::DELIMITER, b.type );
+            ASSERT_EQUALS( Token::TEXT, c.type );
+            ASSERT_EQUALS( Token::INVALID, d.type );
+
+            ASSERT_EQUALS( "blue", a.data.toString() );
+            ASSERT_EQUALS( "-", b.data.toString() );
+            ASSERT_EQUALS( "red", c.data.toString() );
+
+            ASSERT( a.previousWhiteSpace );
+            ASSERT( b.previousWhiteSpace );
+            ASSERT( !c.previousWhiteSpace );
+
+
+            ASSERT_EQUALS( 0U, a.offset );
+            ASSERT_EQUALS( 5U, b.offset );
+            ASSERT_EQUALS( 6U, c.offset );
+        }
+
+        // In English an apostrophe stays inside the word.
+        TEST( Tokenizer, Quote1English ) {
+            const StringData text( "eliot's car" );
+            Tokenizer i( "english", text );
+
+            Token a = i.next();
+            Token b = i.next();
+
+            ASSERT_EQUALS( "eliot's", a.data.toString() );
+            ASSERT_EQUALS( "car", b.data.toString() );
+            ASSERT( !i.more() );
+        }
+
+        // In any other language an apostrophe splits the word like whitespace.
+        TEST( Tokenizer, Quote1French ) {
+            const StringData text( "eliot's car" );
+            Tokenizer i( "french", text );
+
+            Token a = i.next();
+            Token b = i.next();
+            Token c = i.next();
+
+            ASSERT_EQUALS( "eliot", a.data.toString() );
+            ASSERT_EQUALS( "s", b.data.toString() );
+            ASSERT_EQUALS( "car", c.data.toString() );
+            ASSERT( !i.more() );
+        }
+
+    }
+}
+
+
|