001/*- 002 * #%L 003 * HAPI FHIR - Server Framework 004 * %% 005 * Copyright (C) 2014 - 2024 Smile CDR, Inc. 006 * %% 007 * Licensed under the Apache License, Version 2.0 (the "License"); 008 * you may not use this file except in compliance with the License. 009 * You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 * #L% 019 */ 020package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers; 021 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.HashMap; 025import java.util.HashSet; 026import java.util.List; 027import java.util.Map; 028import java.util.Set; 029import java.util.regex.Pattern; 030import java.util.stream.Collectors; 031 032/** 033 * Standardizes text literals by removing noise characters. 034 */ 035public class TextStandardizer implements IStandardizer { 036 037 public static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); 038 039 public static final int EXT_ASCII_RANGE_START = 155; 040 public static final int EXT_ASCII_RANGE_END = 255; 041 042 private List<Range> myAllowedExtendedAscii; 043 private Set<Integer> myAllowedNonLetterAndDigitCharacters = new HashSet<>(); 044 private NoiseCharacters myNoiseCharacters = new NoiseCharacters(); 045 private Map<Integer, Character> myTranslates = new HashMap<>(); 046 047 public TextStandardizer() { 048 myNoiseCharacters.initializeFromClasspath(); 049 050 initializeAllowedNonLetterAndDigitCharacters(); 051 initializeTranslates(); 052 initializeAllowedExtendedAscii(); 053 } 054 055 protected void initializeAllowedNonLetterAndDigitCharacters() { 056 addAllowedNonLetterAndDigitCharacters('.', '\'', ',', '-', '#', '/', '\\', ' '); 057 } 058 059 protected TextStandardizer addAllowedNonLetterAndDigitCharacters(Character... theCharacters) { 060 myAllowedNonLetterAndDigitCharacters.addAll(asSet(theCharacters)); 061 return this; 062 } 063 064 protected Set<Integer> asSet(Character... theCharacters) { 065 return Arrays.stream(theCharacters).map(c -> (int) c).collect(Collectors.toSet()); 066 } 067 068 protected TextStandardizer addTranslate(int theTranslate, char theMapping) { 069 myTranslates.put(theTranslate, theMapping); 070 return this; 071 } 072 073 protected void initializeTranslates() { 074 addTranslate(0x0080, '\''); // PAD 075 addTranslate(0x00A0, ' '); //   076 addTranslate((int) ' ', ' '); //   077 addTranslate(0x201C, '"'); 078 addTranslate(0x201D, '"'); 079 addTranslate(0x2019, ' '); 080 addTranslate(0x2018, ' '); 081 addTranslate(0x02BD, ' '); 082 addTranslate(0x00B4, ' '); 083 addTranslate(0x02DD, '"'); 084 addTranslate((int) '?', '-'); 085 addTranslate((int) '-', '-'); 086 addTranslate((int) '~', '-'); 087 } 088 089 protected void initializeAllowedExtendedAscii() { 090 myAllowedExtendedAscii = new ArrayList<>(); 091 092 // refer to https://www.ascii-code.com for the codes 093 for (int[] i : new int[][] {{192, 214}, {216, 246}, {248, 255}}) { 094 addAllowedExtendedAsciiRange(i[0], i[1]); 095 } 096 } 097 098 protected TextStandardizer addAllowedExtendedAsciiRange(int theRangeStart, int theRangeEnd) { 099 myAllowedExtendedAscii.add(new Range(theRangeStart, theRangeEnd)); 100 return this; 101 } 102 103 public String standardize(String theString) { 104 theString = replaceTranslates(theString); 105 return removeNoise(theString); 106 } 107 108 protected String replaceTranslates(String theString) { 109 StringBuilder buf = new StringBuilder(theString.length()); 110 for (char ch : theString.toCharArray()) { 111 if (myTranslates.containsKey((int) ch)) { 112 buf.append(myTranslates.get((int) ch)); 113 } else { 114 buf.append(ch); 115 } 116 } 117 return buf.toString(); 118 } 119 120 protected String replaceAccents(String theString) { 121 String string = java.text.Normalizer.normalize(theString, java.text.Normalizer.Form.NFD); 122 return DIACRITICAL_MARKS.matcher(string).replaceAll(""); 123 } 124 125 protected String removeNoise(String theToken) { 126 StringBuilder token = new StringBuilder(theToken.length()); 127 for (int offset = 0; offset < theToken.length(); ) { 128 int codePoint = theToken.codePointAt(offset); 129 offset += Character.charCount(codePoint); 130 131 switch (Character.getType(codePoint)) { 132 case Character.CONTROL: // \p{Cc} 133 case Character.FORMAT: // \p{Cf} 134 case Character.PRIVATE_USE: // \p{Co} 135 case Character.SURROGATE: // \p{Cs} 136 case Character.UNASSIGNED: // \p{Cn} 137 break; 138 default: 139 if (!isNoiseCharacter(codePoint)) { 140 token.append(Character.toChars(codePoint)); 141 } 142 break; 143 } 144 } 145 return token.toString(); 146 } 147 148 protected boolean isTranslate(int theChar) { 149 return myTranslates.containsKey(theChar); 150 } 151 152 protected boolean isNoiseCharacter(int theChar) { 153 if (myAllowedExtendedAscii.stream().anyMatch(r -> r.isInRange(theChar))) { 154 return false; 155 } 156 boolean isExtendedAscii = (theChar >= EXT_ASCII_RANGE_START && theChar <= EXT_ASCII_RANGE_END); 157 if (isExtendedAscii) { 158 return true; 159 } 160 return myNoiseCharacters.isNoise(theChar); 161 } 162}