/*-
 * #%L
 * HAPI FHIR JPA Server
 * %%
 * Copyright (C) 2014 - 2024 Smile CDR, Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package ca.uhn.fhir.jpa.search;

import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternTokenizerFactory;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer;
import org.springframework.stereotype.Component;

import static ca.uhn.fhir.jpa.model.search.SearchParamTextPropertyBinder.LOWERCASE_ASCIIFOLDING_NORMALIZER;

/**
 * Holds the analysis configurers for both Lucene and Elasticsearch. The two configurations usually need to be
 * updated simultaneously, and keeping them in one class makes it harder to miss the second one.
 */
@Component
public class HapiHSearchAnalysisConfigurers {

	/**
	 * Defines the Lucene analyzers.
	 */
	public static class HapiLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer {

		public static final String STANDARD_ANALYZER = "standardAnalyzer";
		public static final String NORM_STRING_ANALYZER = "normStringAnalyzer";
		public static final String EXACT_ANALYZER = "exactAnalyzer";

		@Override
		public void configure(LuceneAnalysisConfigurationContext theLuceneCtx) {
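			// Autocomplete analyzer for whole-string prefix matching: the pattern tokenizer keeps the entire
			// input as a single token, which is lowercased and expanded into edge n-grams of 3-50 characters.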
			theLuceneCtx
					.analyzer("autocompleteEdgeAnalyzer")
					.custom()
					.tokenizer(PatternTokenizerFactory.NAME)
					.param(PatternTokenizerFactory.PATTERN, "(.*)")
					.param(PatternTokenizerFactory.GROUP, "1")
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(EdgeNGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "50");

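			// Autocomplete analyzer for "sounds like" matching: standard tokenization, stop-word removal,
			// DoubleMetaphone phonetic encoding and English Snowball stemming.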
			theLuceneCtx
					.analyzer("autocompletePhoneticAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(PhoneticFilterFactory.NAME)
					.param(PhoneticFilterFactory.ENCODER, "DoubleMetaphone")
					.tokenFilter(SnowballPorterFilterFactory.NAME)
					.param("language", "English");

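			// Autocomplete analyzer for substring matching: word-delimiter splitting, lowercasing and
			// n-grams of 3-20 characters.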
			theLuceneCtx
					.analyzer("autocompleteNGramAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(WordDelimiterGraphFilterFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(NGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "20");

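			// Autocomplete analyzer for per-word prefix matching: each token is lowercased, stop words are
			// removed, and the remaining tokens are expanded into edge n-grams of 3-20 characters.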
			theLuceneCtx
					.analyzer("autocompleteWordEdgeAnalyzer")
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(StopFilterFactory.NAME)
					.tokenFilter(EdgeNGramFilterFactory.NAME)
					.param("minGramSize", "3")
					.param("maxGramSize", "20");

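			// General-purpose full-text analyzer: standard tokenization, lowercasing and ASCII folding.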
			theLuceneCtx
					.analyzer(STANDARD_ANALYZER)
					.custom()
					.tokenizer(StandardTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);

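			// Normalized-string analyzer: the whole value is kept as a single token, lowercased and ASCII-folded.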
			theLuceneCtx
					.analyzer(NORM_STRING_ANALYZER)
					.custom()
					.tokenizer(KeywordTokenizerFactory.NAME)
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);

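			// Exact-match analyzer: the value is indexed verbatim as a single token.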
			theLuceneCtx.analyzer(EXACT_ANALYZER).custom().tokenizer(KeywordTokenizerFactory.NAME);

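			// Concept parent PIDs are indexed as a whitespace-separated list of tokens.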
			theLuceneCtx.analyzer("conceptParentPidsAnalyzer").custom().tokenizer(WhitespaceTokenizerFactory.NAME);

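			// Normalizer applying lowercasing and ASCII folding to the whole (untokenized) value.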
			theLuceneCtx
					.normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER)
					.custom()
					.tokenFilter(LowerCaseFilterFactory.NAME)
					.tokenFilter(ASCIIFoldingFilterFactory.NAME);
		}
	}

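	/**
	 * Defines the Elasticsearch analyzers, mirroring {@link HapiLuceneAnalysisConfigurer}.
	 */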
	public static class HapiElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisConfigurer {

		@Override
		public void configure(ElasticsearchAnalysisConfigurationContext theConfigCtx) {

			theConfigCtx
					.analyzer("autocompleteEdgeAnalyzer")
					.custom()
					.tokenizer("pattern_all")
					.tokenFilters("lowercase", "stop", "edgengram_3_50");

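			// Elasticsearch needs the custom tokenizer and token filters declared separately from the
			// analyzers that reference them.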
			theConfigCtx
					.tokenizer("pattern_all")
					.type("pattern")
					.param("pattern", "(.*)")
					.param("group", "1");

			theConfigCtx
					.tokenFilter("edgengram_3_50")
					.type("edge_ngram")
					.param("min_gram", "3")
					.param("max_gram", "50");

			theConfigCtx
					.analyzer("autocompleteWordEdgeAnalyzer")
					.custom()
					.tokenizer("standard")
					.tokenFilters("lowercase", "stop", "wordedgengram_3_50");

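			// Note: despite the "_3_50" suffix, this filter uses max_gram 20, matching the Lucene
			// autocompleteWordEdgeAnalyzer above.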
			theConfigCtx
					.tokenFilter("wordedgengram_3_50")
					.type("edge_ngram")
					.param("min_gram", "3")
					.param("max_gram", "20");

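			// Unlike the Lucene autocompletePhoneticAnalyzer, no phonetic filter is applied here; only
			// stop-word removal and English Snowball stemming.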
			theConfigCtx
					.analyzer("autocompletePhoneticAnalyzer")
					.custom()
					.tokenizer("standard")
					.tokenFilters("stop", "snowball_english");

			theConfigCtx.tokenFilter("snowball_english").type("snowball").param("language", "English");

			theConfigCtx
					.analyzer("autocompleteNGramAnalyzer")
					.custom()
					.tokenizer("standard")
					.tokenFilters("word_delimiter", "lowercase", "ngram_3_20");

			theConfigCtx
					.tokenFilter("ngram_3_20")
					.type("ngram")
					.param("min_gram", "3")
					.param("max_gram", "20");

			theConfigCtx
					.analyzer(HapiLuceneAnalysisConfigurer.STANDARD_ANALYZER)
					.custom()
					.tokenizer("standard")
					.tokenFilters("lowercase", "asciifolding");

			theConfigCtx
					.analyzer(HapiLuceneAnalysisConfigurer.NORM_STRING_ANALYZER)
					.custom()
					.tokenizer("keyword") // We need the whole string to match, including whitespace.
					.tokenFilters("lowercase", "asciifolding");

			theConfigCtx.analyzer("exactAnalyzer").custom().tokenizer("keyword").tokenFilters("unique");

			theConfigCtx.analyzer("conceptParentPidsAnalyzer").custom().tokenizer("whitespace");

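			// Normalizer applying lowercasing and ASCII folding to the whole (untokenized) value.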
			theConfigCtx
					.normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER)
					.custom()
					.tokenFilters("lowercase", "asciifolding");
		}
	}
}