001/*-
002 * #%L
003 * HAPI FHIR - Server Framework
004 * %%
005 * Copyright (C) 2014 - 2024 Smile CDR, Inc.
006 * %%
007 * Licensed under the Apache License, Version 2.0 (the "License");
008 * you may not use this file except in compliance with the License.
009 * You may obtain a copy of the License at
010 *
011 *      http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 * #L%
019 */
020package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;
021
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.HashMap;
025import java.util.HashSet;
026import java.util.List;
027import java.util.Map;
028import java.util.Set;
029import java.util.regex.Pattern;
030import java.util.stream.Collectors;
031
032/**
033 * Standardizes text literals by removing noise characters.
034 */
035public class TextStandardizer implements IStandardizer {
036
037        public static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
038
039        public static final int EXT_ASCII_RANGE_START = 155;
040        public static final int EXT_ASCII_RANGE_END = 255;
041
042        private List<Range> myAllowedExtendedAscii;
043        private Set<Integer> myAllowedNonLetterAndDigitCharacters = new HashSet<>();
044        private NoiseCharacters myNoiseCharacters = new NoiseCharacters();
045        private Map<Integer, Character> myTranslates = new HashMap<>();
046
047        public TextStandardizer() {
048                myNoiseCharacters.initializeFromClasspath();
049
050                initializeAllowedNonLetterAndDigitCharacters();
051                initializeTranslates();
052                initializeAllowedExtendedAscii();
053        }
054
055        protected void initializeAllowedNonLetterAndDigitCharacters() {
056                addAllowedNonLetterAndDigitCharacters('.', '\'', ',', '-', '#', '/', '\\', ' ');
057        }
058
059        protected TextStandardizer addAllowedNonLetterAndDigitCharacters(Character... theCharacters) {
060                myAllowedNonLetterAndDigitCharacters.addAll(asSet(theCharacters));
061                return this;
062        }
063
064        protected Set<Integer> asSet(Character... theCharacters) {
065                return Arrays.stream(theCharacters).map(c -> (int) c).collect(Collectors.toSet());
066        }
067
068        protected TextStandardizer addTranslate(int theTranslate, char theMapping) {
069                myTranslates.put(theTranslate, theMapping);
070                return this;
071        }
072
073        protected void initializeTranslates() {
074                addTranslate(0x0080, '\''); // PAD
075                addTranslate(0x00A0, ' '); // &nbsp
076                addTranslate((int) ' ', ' '); // &nbsp
077                addTranslate(0x201C, '"');
078                addTranslate(0x201D, '"');
079                addTranslate(0x2019, ' ');
080                addTranslate(0x2018, ' ');
081                addTranslate(0x02BD, ' ');
082                addTranslate(0x00B4, ' ');
083                addTranslate(0x02DD, '"');
084                addTranslate((int) '?', '-');
085                addTranslate((int) '-', '-');
086                addTranslate((int) '~', '-');
087        }
088
089        protected void initializeAllowedExtendedAscii() {
090                myAllowedExtendedAscii = new ArrayList<>();
091
092                // refer to https://www.ascii-code.com for the codes
093                for (int[] i : new int[][] {{192, 214}, {216, 246}, {248, 255}}) {
094                        addAllowedExtendedAsciiRange(i[0], i[1]);
095                }
096        }
097
098        protected TextStandardizer addAllowedExtendedAsciiRange(int theRangeStart, int theRangeEnd) {
099                myAllowedExtendedAscii.add(new Range(theRangeStart, theRangeEnd));
100                return this;
101        }
102
103        public String standardize(String theString) {
104                theString = replaceTranslates(theString);
105                return removeNoise(theString);
106        }
107
108        protected String replaceTranslates(String theString) {
109                StringBuilder buf = new StringBuilder(theString.length());
110                for (char ch : theString.toCharArray()) {
111                        if (myTranslates.containsKey((int) ch)) {
112                                buf.append(myTranslates.get((int) ch));
113                        } else {
114                                buf.append(ch);
115                        }
116                }
117                return buf.toString();
118        }
119
120        protected String replaceAccents(String theString) {
121                String string = java.text.Normalizer.normalize(theString, java.text.Normalizer.Form.NFD);
122                return DIACRITICAL_MARKS.matcher(string).replaceAll("");
123        }
124
125        protected String removeNoise(String theToken) {
126                StringBuilder token = new StringBuilder(theToken.length());
127                for (int offset = 0; offset < theToken.length(); ) {
128                        int codePoint = theToken.codePointAt(offset);
129                        offset += Character.charCount(codePoint);
130
131                        switch (Character.getType(codePoint)) {
132                                case Character.CONTROL: // \p{Cc}
133                                case Character.FORMAT: // \p{Cf}
134                                case Character.PRIVATE_USE: // \p{Co}
135                                case Character.SURROGATE: // \p{Cs}
136                                case Character.UNASSIGNED: // \p{Cn}
137                                        break;
138                                default:
139                                        if (!isNoiseCharacter(codePoint)) {
140                                                token.append(Character.toChars(codePoint));
141                                        }
142                                        break;
143                        }
144                }
145                return token.toString();
146        }
147
148        protected boolean isTranslate(int theChar) {
149                return myTranslates.containsKey(theChar);
150        }
151
152        protected boolean isNoiseCharacter(int theChar) {
153                if (myAllowedExtendedAscii.stream().anyMatch(r -> r.isInRange(theChar))) {
154                        return false;
155                }
156                boolean isExtendedAscii = (theChar >= EXT_ASCII_RANGE_START && theChar <= EXT_ASCII_RANGE_END);
157                if (isExtendedAscii) {
158                        return true;
159                }
160                return myNoiseCharacters.isNoise(theChar);
161        }
162}