/*-
 * #%L
 * HAPI FHIR JPA Server
 * %%
 * Copyright (C) 2014 - 2024 Smile CDR, Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package ca.uhn.fhir.jpa.search;

import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternTokenizerFactory;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer;
import org.springframework.stereotype.Component;

import static ca.uhn.fhir.jpa.model.search.SearchParamTextPropertyBinder.LOWERCASE_ASCIIFOLDING_NORMALIZER;

/**
 * This class holds the analysis configuration classes for both Lucene and Elasticsearch. They are kept together
 * because the two configurations usually need to be updated simultaneously, and it is otherwise very easy to miss
 * the second one.
 */
@Component
public class HapiHSearchAnalysisConfigurers {

	/**
	 * Factory for defining the analysers.
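	 * These definitions should be kept in sync with {@link HapiElasticsearchAnalysisConfigurer} below.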
	 */
	public static class HapiLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer {

		public static final String STANDARD_ANALYZER = "standardAnalyzer";
		public static final String NORM_STRING_ANALYZER = "normStringAnalyzer";
		public static final String EXACT_ANALYZER = "exactAnalyzer";

		@Override
		public void configure(LuceneAnalysisConfigurationContext theLuceneCtx) {
			theLuceneCtx
					.analyzer("autocompleteEdgeAnalyzer")
					.custom()
					.tokenizer(PatternTokenizerFactory.NAME)
					.param(PatternTokenizerFactory.PATTERN, "(.*)")
					.param(PatternTokenizerFactory.GROUP, "1")
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(EdgeNGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "50");

			theLuceneCtx
					.analyzer("autocompletePhoneticAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(PhoneticFilterFactory.NAME)
					.param(PhoneticFilterFactory.ENCODER, "DoubleMetaphone")
					.tokenFilter(SnowballPorterFilterFactory.NAME)
					.param("language", "English");

			theLuceneCtx
					.analyzer("autocompleteNGramAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(WordDelimiterGraphFilterFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(NGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "20");

			theLuceneCtx
					.analyzer("autocompleteWordEdgeAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(EdgeNGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "20");

			theLuceneCtx
					.analyzer(STANDARD_ANALYZER)
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);

			theLuceneCtx
					.analyzer(NORM_STRING_ANALYZER)
					.custom()
					.tokenizer(KeywordTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);

			theLuceneCtx.analyzer(EXACT_ANALYZER).custom().tokenizer(KeywordTokenizerFactory.NAME);

			theLuceneCtx.analyzer("conceptParentPidsAnalyzer").custom().tokenizer(WhitespaceTokenizerFactory.NAME);

			theLuceneCtx
					.normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER)
					.custom()
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);
		}
	}

	public static class HapiElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisConfigurer {

		@Override
		public void configure(ElasticsearchAnalysisConfigurationContext theConfigCtx) {

			theConfigCtx
					.analyzer("autocompleteEdgeAnalyzer")
					.custom()
					.tokenizer("pattern_all")
					.tokenFilters("lowercase", "stop", "edgengram_3_50");

			theConfigCtx
					.tokenizer("pattern_all")
					.type("pattern")
					.param("pattern", "(.*)")
					.param("group", "1");

			theConfigCtx
					.tokenFilter("edgengram_3_50")
					.type("edge_ngram")
					.param("min_gram", "3")
					.param("max_gram", "50");

			theConfigCtx
					.analyzer("autocompleteWordEdgeAnalyzer")
					.custom()
					.tokenizer("standard")
					.tokenFilters("lowercase", "stop", "wordedgengram_3_50");

			theConfigCtx
					.tokenFilter("wordedgengram_3_50")
.type("edge_ngram") 160 .param("min_gram", "3") 161 .param("max_gram", "20"); 162 163 theConfigCtx 164 .analyzer("autocompletePhoneticAnalyzer") 165 .custom() 166 .tokenizer("standard") 167 .tokenFilters("stop", "snowball_english"); 168 169 theConfigCtx.tokenFilter("snowball_english").type("snowball").param("language", "English"); 170 171 theConfigCtx 172 .analyzer("autocompleteNGramAnalyzer") 173 .custom() 174 .tokenizer("standard") 175 .tokenFilters("word_delimiter", "lowercase", "ngram_3_20"); 176 177 theConfigCtx 178 .tokenFilter("ngram_3_20") 179 .type("ngram") 180 .param("min_gram", "3") 181 .param("max_gram", "20"); 182 183 theConfigCtx 184 .analyzer(HapiLuceneAnalysisConfigurer.STANDARD_ANALYZER) 185 .custom() 186 .tokenizer("standard") 187 .tokenFilters("lowercase", "asciifolding"); 188 189 theConfigCtx 190 .analyzer(HapiLuceneAnalysisConfigurer.NORM_STRING_ANALYZER) 191 .custom() 192 .tokenizer("keyword") // We need the whole string to match, including whitespace. 193 .tokenFilters("lowercase", "asciifolding"); 194 195 theConfigCtx.analyzer("exactAnalyzer").custom().tokenizer("keyword").tokenFilters("unique"); 196 197 theConfigCtx.analyzer("conceptParentPidsAnalyzer").custom().tokenizer("whitespace"); 198 199 theConfigCtx 200 .normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER) 201 .custom() 202 .tokenFilters("lowercase", "asciifolding"); 203 } 204 } 205}