Solr fuzzy search with edit distance above 1

Environment: java version "11.0.12" 2021-07-20 LTS, solr-8.9.0
I have the following field declaration for my Solr index:
<field name="Field1" type="string" multiValued="false" indexed="false" stored="true"/>
<field name="author" type="text_general" multiValued="false" indexed="true" stored="true"/>
<field name="Field2" type="string" multiValued="false" indexed="false" stored="true"/>
Field type:
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
The Solr core was created using the command: ./solr create -c fuzzyCore
The .csv file used to index the data is https://drive.google.com/file/d/1z684x2GKsSQWGAdyi6O4uKit4a96iiuh/view
I understand that "Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance, algorithm. To do a fuzzy search, use the tilde, "~", symbol at the end of a single-word term." The ~ operator runs fuzzy searches: it is appended to a single term and may optionally be followed by an edit distance, as below:
FIELD_NAME:TERM_1~{EDIT_DISTANCE}
(A bare ~ with no number defaults to an edit distance of 2.)
Since KeywordTokenizer keeps the whole input as a single token and I want each word to be searchable, StandardTokenizer is used.
The request looks like this:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=author:beaeb~' AND Field1:(w1 x)" --data-urlencode "rows=20"
{
"responseHeader":{
"status":0,
"QTime":14,
"params":{
"q":"author:beaeb~' AND Field1:(w1 x)",
"rows":"20"}},
"response":{"numFound":12,"start":0,"numFoundExact":true,"docs":[
{
"Field1":"x",
"author":"bbaeb",
"Field2":"o",
"id":"f8fbb58d-9e0d-47b2-aa3c-e3920e25a7d1",
"_version_":1746912583192936455},
{
"Field1":"x",
"author":"beabe",
"Field2":"p",
"id":"7d73e7ba-8455-4eb4-818f-1e19b1d35a22",
"_version_":1746912583244316680},
{
"Field1":"x",
"author":"baeeb",
"Field2":"n",
"id":"b4e86fc3-7ecc-407b-b638-88d167a66934",
"_version_":1746912583292551181},
{
"Field1":"x",
"author":"beaea",
"Field2":"o",
"id":"131ad4de-eaa2-47b8-b58b-e690316eed1c",
"_version_":1746912583314571267},
{
"Field1":"x",
"author":"bbaeb",
"Field2":"q",
"id":"d034e66c-a302-4b24-a186-5a2bafecab40",
"_version_":1746912583392165900},
{
"Field1":"x",
"author":"beacb",
"Field2":"n",
"id":"c0ab3e48-2b2d-438d-8cc2-1acfcf6efde8",
"_version_":1746912583490732036},
{
"Field1":"x",
"author":"aeabe",
"Field2":"m",
"id":"4472ec5d-eace-446f-b1d6-c8911be24368",
"_version_":1746912583266336776},
{
"Field1":"x",
"author":"baeab",
"Field2":"q",
"id":"b4c24da3-9199-4eba-a8a3-e30fc17d9167",
"_version_":1746912583274725377},
{
"Field1":"x",
"author":"aeaea",
"Field2":"n",
"id":"bb17bc26-e392-4fed-ae46-bbdd40af0ac0",
"_version_":1746912583294648329},
{
"Field1":"x",
"author":"aeceb",
"Field2":"p",
"id":"5e5cfe21-ff19-464f-8adf-8b5888c418e4",
"_version_":1746912583296745472},
{
"Field1":"x",
"author":"baeab",
"Field2":"p",
"id":"54a3c8e6-137d-47c3-9192-a5ed1904dc55",
"_version_":1746912583357562889},
{
"Field1":"x",
"author":"aeeeb",
"Field2":"m",
"id":"200694a0-6248-49fd-8182-dac79657e045",
"_version_":1746912583385874444}]
}}
The above request does not return the document with author:bebbeb, although a document with author:bebbeb and Field1:w1 is present in the data. This can be verified with the following two commands:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=author:beaeb~' AND Field1:w1"
{
"responseHeader":{
"status":0,
"QTime":4,
"params":{
"q":"author:beaeb~' AND Field1:w1"}},
"response":{"numFound":0,"start":0,"numFoundExact":true,"docs":[]
}}
However, the output of the following command is:
curl "http://localhost:8983/solr/fuzzyCore/select" --data-urlencode "q=Field1:w1"
{
"responseHeader":{
"status":0,
"QTime":1,
"params":{
"q":"Field1:w1"}},
"response":{"numFound":1,"start":0,"numFoundExact":true,"docs":[
{
"Field1":"w1",
"author":"bebbeb",
"Field2":"p",
"id":"4356dff2-ab93-4bab-a4dc-1797db38240c",
"_version_":1746912583504363523}]
}}
I have tried to post everything needed to understand my problem. Any ideas? Why is author:bebbeb not returned for the input beaeb~?

After debugging Lucene we discovered that there is a parameter called maxExpansions, set to 50 by default, which could be raised up to 1024.
However, looking at the Solr code, we can see that the FuzzyQuery constructor is only called in two places and always with the default maxExpansions value (for performance reasons). This means fuzzy searches take at most the 50 most similar terms and discard the others. That is why, when many documents are indexed and most of the terms are similar (as in your case), some documents may not be returned.
A Solr open-source contribution would be needed to expose this parameter and make the use of this feature more flexible (allowing different values to be set).
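For illustration, here is a minimal Lucene-level sketch of what such a contribution would expose; the FuzzyQuery constructor already accepts maxExpansions (the field and term are taken from the question, everything else is hypothetical):
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;

public class FuzzyExpansionsSketch {
    // Equivalent of author:beaeb~ (edit distance 2), but with a larger candidate-term budget.
    public static FuzzyQuery build() {
        int maxEdits = 2;          // the edit distance; a bare ~ defaults to 2
        int prefixLength = 0;      // no exact-prefix requirement on candidate terms
        int maxExpansions = 1024;  // Lucene's default is 50; similar terms beyond that are discarded
        boolean transpositions = true;
        return new FuzzyQuery(new Term("author", "beaeb"),
                maxEdits, prefixLength, maxExpansions, transpositions);
    }
}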

Related

How to use Solr MinHashQParser

Currently I'm trying to integrate Jaccard similarity search using MinHash, and I stumbled upon Solr 8.11's MinHash Query Parser, whose docs say:
The queries measure Jaccard similarity between the query string and MinHash fields
How do I implement it correctly?
As the docs say, I added a <fieldType> and <field> like so:
<field name="min_hash_analysed" type="text_min_hash" multiValued="false" indexed="true" stored="false" />
<fieldType name="text_min_hash" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.ICUTokenizerFactory"/>
<filter class="solr.ICUFoldingFilterFactory"/>
<filter class="solr.ShingleFilterFactory" minShingleSize="5" outputUnigrams="false" outputUnigramsIfNoShingles="false" maxShingleSize="5" tokenSeparator=" "/>
<filter class="org.apache.lucene.analysis.minhash.MinHashFilterFactory" bucketCount="512" hashSetSize="1" hashCount="1"/>
</analyzer>
</fieldType>
I tried saving some text to the new min_hash_analysed field and then querying very similar text using the query provided in the docs:
{!min_hash field="min_hash_analysed" sim="0.5" tp="0.5"}Very similar text to already saved document text
I was hoping to get back all documents with a similarity score higher than sim="0.5", but no matter what I try, I get "numFound":0.
Surely I'm doing something wrong. How should I correctly integrate Solr's MinHash Query Parser?
According to the response, it seems you're sending {!min_hash field..} directly as a query parameter, not as a Solr query given by the q= parameter.
q={!min_hash ..}query text here
.. would be the correct syntax in the URL (and apply URL escaping as required).
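As an illustration, here is the same request sent via SolrJ, which takes care of the URL escaping; the collection name mycollection is a placeholder:
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class MinHashQuerySketch {
    public static void main(String[] args) throws Exception {
        try (HttpSolrClient client =
                 new HttpSolrClient.Builder("http://localhost:8983/solr/mycollection").build()) {
            SolrQuery query = new SolrQuery();
            // The {!min_hash ..} local params and the query text together form the q parameter.
            query.setQuery("{!min_hash field=\"min_hash_analysed\" sim=\"0.5\" tp=\"0.5\"}"
                    + "Very similar text to already saved document text");
            QueryResponse rsp = client.query(query);
            System.out.println("numFound=" + rsp.getResults().getNumFound());
        }
    }
}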

Solr tokenizer does not do anything

I want to tokenize one Solr string field, "content", into another field, "tokenized".
So e.g.:
{
"content":"Hello World this is a Test",
"tokenized":["hello", "world", "this", ...]
}
For that I use
<field name="content" type="string" indexed="true" stored="true"/>
<field name="tokenized" type="customType" indexed="true" stored="true"/>
<copyField source="content" dest="tokenized"/>
and the custom field type
<fieldType name="customType" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
My understanding was that upon committing, all contents are tokenized with the specified tokenizer and then put, as a list of tokens, into the tokenized field. However, the tokenized field only contains the unchanged content in a list, e.g.:
{
"content":"Hello World this is a Test",
"tokenized":["Hello World this is a Test"]
}
Is there some global configuration I need to make to get tokenizers to work?
Tokens are only stored internally in Lucene and Solr. They do not change the stored text that gets returned to you in any way. The text is stored verbatim - i.e. the text you sent in is what gets returned to you.
The tokens generated in the background and stored in the index affect how you can search against the content you've stored and how it's processed; they do not affect the display value of the field.
You can use the Analysis screen in Solr's Admin UI to see exactly how text for a field gets processed into tokens before being stored in the index.
The reason for this is that you're usually interested in returning the actual text to the user; making the tokenized and processed values visible doesn't really make sense for a document that gets returned to a human.
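To make the distinction concrete, here is a small Lucene sketch (Lucene is the engine underneath Solr) that rebuilds the same customType chain and prints the tokens that would go into the index; the stored value is never touched by this:
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CustomTypeTokens {
    public static void main(String[] args) throws IOException {
        // Same chain as the customType fieldType: StandardTokenizer + LowerCaseFilter.
        Analyzer analyzer = CustomAnalyzer.builder()
                .withTokenizer("standard")
                .addTokenFilter("lowercase")
                .build();
        List<String> tokens = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("tokenized", "Hello World this is a Test")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                tokens.add(term.toString());
            }
            ts.end();
        }
        // Prints [hello, world, this, is, a, test]; these tokens exist only in the index,
        // while queries still return the verbatim stored value.
        System.out.println(tokens);
    }
}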

Sitecore 9 Indexing : Solr Pattern Tokenizer not Working

I'm new to this Sitecore and Solr combination. I have a little issue with the pattern tokenizer, which is not working. I'm following this documentation:
Solr :
https://lucene.apache.org/solr/guide/6_6/tokenizers.html#Tokenizers-RegularExpressionPatternTokenizer
Sitecore 9 Solr :
https://doc.sitecore.net/sitecore_experience_platform/setting_up_and_maintaining/search_and_indexing/using_solr_field_name_resolution
When I do the indexing, my field value is a,b,c. I expected it to end up as ["a","b","c"] in Solr, but it contains ["a,b,c"].
This is my Sitecore Config
<fieldMap>
<typeMatches hint="raw:AddTypeMatch">
<typeMatch type="System.Collections.Generic.List`1[System.String]" typeName="commaDelimitedCollection" fieldNameFormat="{0}_cd"
multiValued="true" settingType="Sitecore.ContentSearch.SolrProvider.SolrSearchFieldConfiguration, Sitecore.ContentSearch.SolrProvider"/>
</typeMatches>
<fieldNames hint="raw:AddFieldByFieldName">
<field fieldName="Keywords" returnType="commaDelimitedCollection"/>
</fieldNames>
</fieldMap>
This is my Solr Schema
<fieldType name="commaDelimited" class="solr.TextField" multiValued="true">
<analyzer>
<tokenizer class="solr.PatternTokenizerFactory" pattern="\s*,\s*"/>
</analyzer>
</fieldType>
<dynamicField name="*_cd" type="commaDelimited" multiValued="true" indexed="true" stored="true"/>
Any idea what's wrong with my configuration above?
Thanks
Not sure if I get the full picture here. Maybe your approach is perfectly valid, but I don't think I've seen that one before. Instead of defining a new type, you could reuse *_sm (multiValued string) and perform the splitting of the string at index time on the Sitecore side. Usually you don't need more field types than the ones provided by Sitecore, and it's typically easier to maintain all the code in your VS solution than to depend on additional Solr config. (In Sitecore 9 you can deploy your Solr managed schema from the control panel, though.)
A simple computed field can look like this:
<fields hint="raw:AddComputedIndexField">
<field fieldName="keywords" returnType="stringCollection">
Your.Name.Space.YourComputedFieldClass, YourAssembly
</field>
</fields>
And a class implementation could look something like this:
public class YourComputedFieldClass : IComputedIndexField
{
public object ComputeFieldValue(IIndexable indexable)
{
var item = indexable as SitecoreIndexableItem;
var fieldValue = item?.Item?["Keywords"];
if (string.IsNullOrWhiteSpace(fieldValue)) {
return null;
}
return fieldValue.Split(',');
}
public string FieldName { get; set; }
public string ReturnType { get; set; }
}

Pattern Tokenizer Factory doesn't work properly

I'm trying to parse an input line using PatternTokenizerFactory.
According to the doc:
https://lucene.apache.org/core/4_1_0/analyzers-common/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.html
My schema looks like:
<fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.PatternTokenizerFactory" pattern="("bbb": ")([[a-zA-Z ]+)" group="2"/>
</analyzer>
</fieldType>
So, this pattern should work: https://regex101.com/r/9Ep6qO/6
According to the schema, I'm trying to get the value from a particular part of the "test" field ('bbb'). As I understand it, I should now be able to find a doc in Solr just by searching "test":"Acc Hs".
But I can only search using a construction like this: "test":"'bbb': 'Acc Hs'"
My solution was to split this input and then use the filter:
<tokenizer class="solr.PatternTokenizerFactory" pattern="(.*\"bbb\": \")" />
<filter class="solr.PatternCaptureGroupFilterFactory"
pattern="(^[a-zA-Z ]+)"
preserve_original="false"/>
So, could you explain why the first option isn't working? (There was no difference when I put e.g. group="1".)
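For what it's worth, the regex and the group="2" extraction can be sanity-checked outside Solr with plain java.util.regex; the input below is a made-up sample, and note that inside the schema's XML attribute the inner double quotes would have to be escaped (e.g. as &quot;):
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class PatternGroupCheck {
    public static void main(String[] args) {
        // Group 1 is the "bbb": " prefix; group 2 is the value that group="2" would emit as a token.
        Pattern p = Pattern.compile("(\"bbb\": \")([a-zA-Z ]+)");
        Matcher m = p.matcher("{\"aaa\": \"x\", \"bbb\": \"Acc Hs\"}"); // hypothetical input
        while (m.find()) {
            System.out.println(m.group(2)); // prints: Acc Hs
        }
    }
}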

Solr - Copy tokenized multi valued field to a string

I have a field which is tokenized and multivalued, with many values. I want to use these terms as facets. When I search by the term "not necessarily" (note the space), I get the results for the term "not". This leads me to believe that I cannot use a tokenized field as a facet field. I tried to copy the values of the field to a text field with a KeywordTokenizer, but when checking the schema browser I am told: "Sorry, no Term Info available :(" This is after I delete the old index and upload the documents again. What am I doing wrong here?
<copyField source="ColonCancerField" dest="cytokineField"/>
<field name="cytokineField" indexed="true" stored="true" multiValued="true" type="Cytokine_Pass"/>
<fieldType name="Cytokine_Pass" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory" />
</analyzer>
</fieldType>
<field name="ColonCancerField" type="ColonCancer" indexed="true" stored="true" multiValued="true"
termPositions="true"
termVectors="true"
termOffsets="true"/>
<fieldType name="ColonCancer" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<filter class="solr.ShingleFilterFactory"
minShingleSize="2" maxShingleSize="5"
outputUnigramsIfNoShingles="true"
/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms_ColonCancer.txt" ignoreCase="true" expand="true" tokenizerFactory="solr.KeywordTokenizerFactory"/>
<filter class="solr.KeepWordFilterFactory"
words="prefLabels_ColonCancer.txt" ignoreCase="true"/>
</analyzer>
</fieldType>
<copyField source="content" dest="ColonCancerField"/>
Most likely, something goes wrong between your synonyms and your KeepWord list. In the Admin UI there is an Analysis screen where you can put in your original text and your search string and watch both go through the analysis of the chosen field (ColonCancerField for you). The matches will be highlighted.
That's for the search. As for the facets, using ColonCancerField should have worked; try it without any search (q=*:*).
The KeywordTokenizer does not split the text at all, so you should get the whole field as a facet.
Also, the copyField directives do NOT layer. So you cannot copy field1 to field2 and then field2 to field3; you need to copy field1 to both field2 and field3. That's probably part of your issue here.
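For reference, here is a minimal SolrJ sketch of that facet check, assuming the Cytokine core from the upload command further down:
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class FacetCheckSketch {
    public static void main(String[] args) throws Exception {
        try (HttpSolrClient client =
                 new HttpSolrClient.Builder("http://localhost:8983/solr/Cytokine").build()) {
            SolrQuery query = new SolrQuery("*:*"); // no search - facets only
            query.setFacet(true);
            query.addFacetField("ColonCancerField");
            query.setRows(0);
            QueryResponse rsp = client.query(query);
            for (FacetField.Count c : rsp.getFacetField("ColonCancerField").getValues()) {
                System.out.println(c.getName() + " (" + c.getCount() + ")");
            }
        }
    }
}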
I decided to add a field (cancerTerms) and add the terms in there. But I am not doing this outside of Solr: I am using the analysis chain and passing it through a ScriptUpdateProcessor. There I can take the results of the analysis chain and store them on the document (as a strField). Then I facet on this field (cancerTerms). This actually gives me the correct results; it does not give me issues with "not" and "not necessarily" or any other similar issue. Also, I am no longer storing the analysis-chain field (I was previously). It makes no sense to store it because it is a copy field (apparently copy fields only copy the source text and then pipe it to the analyzer, and cannot be chained). I am only storing the results of the chain (which is useful for faceting).
Here is a simplified view of what I am doing:
Content [is copied to] -> ColonCancerField (analysis chain [not stored,
and will produce tokenized strings]) -> passed to update-script (processes
each token as a string) [added to] -> cancerTerms (strField)
id: 2040ee23-c5dc-459c-969f-2ebf6c728184
title: Immune profile modulation of blood and mucosal eosinophils in nasal polyposis with concomitant asthma.
content: BACKGROUND: Chronic rhinosinusitis with nasal polyps (CRSwNP) is frequently associated with asthma. Mucosal eosinophil (EO) infiltrate has been found to correlate with asthma and disease severity but not necessarily in every patient. Other multifactorial immune processes are required to determine disease endotypes and response to treatment. OBJECTIVE: To evaluate EO immunomodulation for migration and survival in accordance with inflammatory protein profiles and asthmatic status in CRSwNP. METHODS: Ninety-three patients (47 with asthma) with CRSwNP were included. Each patient was staged clinically according to symptom severity and polyp size. Nasal secretions were collected to establish a cytokine profile. The EOs were purified from blood samples and nasal polyps to delineate specific immunophenotypes by flow cytometry and determine in vitro EO survival in relation to asthmatic status. RESULTS: The CRSwNP in patients with asthma was characterized by eosinophilia and a high level of interleukin (IL)-5 in nasal secretions. Although EOs exhibited activation profiles after mucosal migration, there was relative down-expression of IL-5 receptor-α (IL-5Rα) on nasal EOs in patients with asthma. The EO culture with IL-5 and IL-9 showed an antiapoptotic effect in patients with asthma through IL-5Rα modulation. CONCLUSION: Mucosal eosinophilia seems to be induced by EO nasal trapping through modulation of adhesion receptors. In patients with asthma, EO involvement is enhanced by the antiapoptotic synergistic action of T-helper cell type 2 cytokines on IL-5Rα expression. This study shows for the first time that IL-9 is involved in EO homeostasis in CRSwNP and could explain the low benefit of anti-IL-5 therapy for some patients with asthma and nasal polyposis.
cytokineTerms: t cell replacing factor, type ii interferon, c7, chemokine, interleukin 17 precursor, leukocyte mediator, interleukins, t cell replacing factor, t cell replacing factor, il9 protein, interferon alpha-5, cytokines, il9 protein
cancerTerms: but, not, not necessarily, although
version: 1522116540216901632
score: 1.0
Here is some of the code (please forgive the mess. I have included changes for Solr ver. 5):
UpdateScript
/***************************UpdateScript*********************************/
function getAnalyzerResult(analyzer, fieldName, fieldValue) {
var result = [];
var token_stream = analyzer.tokenStream(fieldName, new java.io.StringReader(fieldValue));//null value?
var term_att = token_stream.getAttribute(Packages.org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
token_stream.reset();
while (token_stream.incrementToken()) {
result.push(term_att.toString());
}
token_stream.end();
token_stream.close();
return result;
}
function processAdd(cmd) {
doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument
id = doc.getFieldValue("id");
logger.warn("update-script#processAdd: id=" + id);
var content = doc.getFieldValue("content"); // Comes from /update/extract
//facetList contains the actual facet terms
//facetAnalyzerName contains the Analyzer name for the term vector list names. (i.e the field type)
var facetList = ["cytokineTerms", "cancerTerms"];
var facetAnalyzerName = ["key_phrases", "ColonCancer"];
/*
Loop through all of the facets, and get the analyzer and the name for the field
Then add the terms to the document
*/
for(var i = 0; i < facetList.length; i++){
var analyzer = req.getCore().getLatestSchema().getFieldTypeByName(facetAnalyzerName[i]).getIndexAnalyzer();
var terms = getAnalyzerResult(analyzer, null, content);
for(var index = 0; index < terms.length; index++){
doc.addField(facetList[i], terms[index]);
}
}
}
// The functions below must be defined, but there's rarely a need to implement
// anything in these.
function processDelete(cmd) {
// no-op
}
function processMergeIndexes(cmd) {
// no-op
}
function processCommit(cmd) {
// no-op
}
function processRollback(cmd) {
// no-op
}
function finish() {
// no-op
}
/***************************UpdateScript*********************************/
updateRequestProcessorChain
/****************updateRequestProcessorChain ***********************/
<updateRequestProcessorChain name="script" default="true">
<processor class="solr.StatelessScriptUpdateProcessorFactory">
<str name="script">update-script.js</str>
<lst name="params">
<str name="config_param">example config parameter</str>
</lst>
</processor>
<processor class="solr.LogUpdateProcessorFactory"/>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
/****************updateRequestProcessorChain ***********************/
Upload using post
java -Durl=http://localhost:8983/solr/Cytokine/update -Dauto -Dparams=update.chain=script -jar bin/post.jar C:/Users/Kevin/Downloads/pubmed_result.json
Sources:
http://lucidworks.com/blog/2013/06/27/poor-mans-entity-extraction-with-solr/
https://www.youtube.com/watch?v=AXSK2RvVJsk
https://wiki.apache.org/solr/ScriptUpdateProcessor
https://lucene.apache.org/solr/5_0_0/changes/Changes.html#v5.0.0.upgrading_from_solr_4.x
https://gist.github.com/erikhatcher/50e653c1c09abb68e068
Archive:
https://mail-archives.apache.org/mod_mbox/lucene-solr-user/201512.mbox/%3CCAH57+p4FK=Ta84dEpUR4p0xWQ2YWkOWPpj566ZZzhdjW9F_ZJg#mail.gmail.com%3E
