package ca.uhn.fhir.jpa.search;

/*-
 * #%L
 * HAPI FHIR JPA Server
 * %%
 * Copyright (C) 2014 - 2023 Smile CDR, Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternTokenizerFactory;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurationContext;
import org.hibernate.search.backend.lucene.analysis.LuceneAnalysisConfigurer;
import org.springframework.stereotype.Component;

import static ca.uhn.fhir.jpa.model.search.SearchParamTextPropertyBinder.LOWERCASE_ASCIIFOLDING_NORMALIZER;
/**
 * Holds the analysis configuration for both Lucene and Elasticsearch. The two backends usually need to be
 * updated simultaneously, and it is otherwise very easy to miss one of them.
 */
@Component
public class HapiHSearchAnalysisConfigurers {

	/**
	 * Configurer defining the Lucene analyzers.
	 */
	public static class HapiLuceneAnalysisConfigurer implements LuceneAnalysisConfigurer {

		public static final String STANDARD_ANALYZER = "standardAnalyzer";
		public static final String NORM_STRING_ANALYZER = "normStringAnalyzer";
		public static final String EXACT_ANALYZER = "exactAnalyzer";

		@Override
		public void configure(LuceneAnalysisConfigurationContext theLuceneCtx) {
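			// Treats the whole input as a single token (the pattern captures the entire string), then
			// lowercases it and emits edge n-grams of 3-50 characters, so queries can prefix-match the full text.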
			theLuceneCtx.analyzer("autocompleteEdgeAnalyzer").custom()
				.tokenizer(PatternTokenizerFactory.class).param("pattern", "(.*)").param("group", "1")
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(StopFilterFactory.class)
				.tokenFilter(EdgeNGramFilterFactory.class)
				.param("minGramSize", "3")
				.param("maxGramSize", "50");

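			// Matches words that sound alike: standard tokenization, stop-word removal, DoubleMetaphone
			// phonetic encoding, and English Snowball stemming.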
			theLuceneCtx.analyzer("autocompletePhoneticAnalyzer").custom()
				.tokenizer(StandardTokenizerFactory.class)
				.tokenFilter(StopFilterFactory.class)
				.tokenFilter(PhoneticFilterFactory.class).param("encoder", "DoubleMetaphone")
				.tokenFilter(SnowballPorterFilterFactory.class).param("language", "English");

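			// Supports matching on substrings anywhere within a word: tokens are split on case and
			// punctuation transitions, lowercased, and expanded into n-grams of 3-20 characters.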
			theLuceneCtx.analyzer("autocompleteNGramAnalyzer").custom()
				.tokenizer(StandardTokenizerFactory.class)
				.tokenFilter(WordDelimiterGraphFilterFactory.class)
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(NGramFilterFactory.class)
				.param("minGramSize", "3")
				.param("maxGramSize", "20");

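			// Supports prefix matching on each individual word via per-token edge n-grams of 3-20 characters.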
			theLuceneCtx.analyzer("autocompleteWordEdgeAnalyzer").custom()
				.tokenizer(StandardTokenizerFactory.class)
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(StopFilterFactory.class)
				.tokenFilter(EdgeNGramFilterFactory.class)
				.param("minGramSize", "3")
				.param("maxGramSize", "20");

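			// General-purpose full-text analyzer: case- and accent-insensitive token matching.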
			theLuceneCtx.analyzer(STANDARD_ANALYZER).custom()
				.tokenizer(StandardTokenizerFactory.class)
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(ASCIIFoldingFilterFactory.class);

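			// Keeps the whole value as a single token, lowercased and ASCII-folded, for case- and
			// accent-insensitive matching of the complete string.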
			theLuceneCtx.analyzer(NORM_STRING_ANALYZER).custom()
				.tokenizer(KeywordTokenizerFactory.class)
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(ASCIIFoldingFilterFactory.class);

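			// Keeps the whole value as a single, unmodified token for verbatim, case-sensitive matching.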
			theLuceneCtx.analyzer(EXACT_ANALYZER).custom()
				.tokenizer(KeywordTokenizerFactory.class);

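			// Splits a space-separated list of parent concept pids into individual tokens.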
			theLuceneCtx.analyzer("conceptParentPidsAnalyzer").custom()
				.tokenizer(WhitespaceTokenizerFactory.class);

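			// Normalizer (applied to the whole field value, with no tokenization) used by fields bound
			// through SearchParamTextPropertyBinder.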
			theLuceneCtx.normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER).custom()
				.tokenFilter(LowerCaseFilterFactory.class)
				.tokenFilter(ASCIIFoldingFilterFactory.class);

		}
	}

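	/**
	 * Configurer defining the matching Elasticsearch analyzers.
	 */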
	public static class HapiElasticsearchAnalysisConfigurer implements ElasticsearchAnalysisConfigurer {

		@Override
		public void configure(ElasticsearchAnalysisConfigurationContext theConfigCtx) {

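			// The definitions below mirror the Lucene analyzers above. Unlike the Lucene configuration,
			// Elasticsearch tokenizers and token filters are declared as separate named components and
			// referenced by name, rather than configured inline.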
			theConfigCtx.analyzer("autocompleteEdgeAnalyzer").custom()
				.tokenizer("pattern_all")
				.tokenFilters("lowercase", "stop", "edgengram_3_50");

			theConfigCtx.tokenizer("pattern_all")
				.type("pattern")
				.param("pattern", "(.*)")
				.param("group", "1");

			theConfigCtx.tokenFilter("edgengram_3_50")
				.type("edge_ngram")
				.param("min_gram", "3")
				.param("max_gram", "50");

			theConfigCtx.analyzer("autocompleteWordEdgeAnalyzer").custom()
				.tokenizer("standard")
				.tokenFilters("lowercase", "stop", "wordedgengram_3_20");

			theConfigCtx.tokenFilter("wordedgengram_3_20")
				.type("edge_ngram")
				.param("min_gram", "3")
				.param("max_gram", "20");

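			// Note: unlike the Lucene analyzer of the same name, no phonetic (DoubleMetaphone) filter is
			// applied here; Elasticsearch only provides a phonetic token filter through the optional
			// analysis-phonetic plugin.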
			theConfigCtx.analyzer("autocompletePhoneticAnalyzer").custom()
				.tokenizer("standard")
				.tokenFilters("stop", "snowball_english");

			theConfigCtx.tokenFilter("snowball_english")
				.type("snowball")
				.param("language", "English");

			theConfigCtx.analyzer("autocompleteNGramAnalyzer").custom()
				.tokenizer("standard")
				.tokenFilters("word_delimiter", "lowercase", "ngram_3_20");

			theConfigCtx.tokenFilter("ngram_3_20")
				.type("ngram")
				.param("min_gram", "3")
				.param("max_gram", "20");

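			// These reuse the analyzer names declared in the Lucene configurer above, so the same field
			// mappings work against either backend.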
			theConfigCtx.analyzer(HapiLuceneAnalysisConfigurer.STANDARD_ANALYZER).custom()
				.tokenizer("standard")
				.tokenFilters("lowercase", "asciifolding");

			theConfigCtx.analyzer(HapiLuceneAnalysisConfigurer.NORM_STRING_ANALYZER).custom()
				.tokenizer("keyword") // We need the whole string to match, including whitespace.
				.tokenFilters("lowercase", "asciifolding");

			theConfigCtx.analyzer(HapiLuceneAnalysisConfigurer.EXACT_ANALYZER).custom()
				.tokenizer("keyword")
				.tokenFilters("unique");

			theConfigCtx.analyzer("conceptParentPidsAnalyzer").custom()
				.tokenizer("whitespace");

			theConfigCtx.normalizer(LOWERCASE_ASCIIFOLDING_NORMALIZER).custom()
				.tokenFilters("lowercase", "asciifolding");

		}
	}

}