/*-
 * #%L
 * HAPI FHIR - Server Framework
 * %%
 * Copyright (C) 2014 - 2025 Smile CDR, Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;

import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Standardizer for title-like text, built on top of {@link LastNameStandardizer}.
 *
 * <p>On top of the inherited name-token handling, this class:
 * <ul>
 * <li>registers additional delimiters and allowed characters in its constructor;</li>
 * <li>keeps a set of "exception" words (for example {@code LLC} or {@code of}) that are always
 * emitted with the exact casing in which they are registered, whatever the casing of the input;</li>
 * <li>keeps a list of two-part ("bi-gram") exceptions, so that a part following a given part is
 * emitted in its registered form (for example the {@code s} following an apostrophe).</li>
 * </ul>
 */
public class TitleStandardizer extends LastNameStandardizer {

	/** Words emitted with exactly this casing; matched against input case-insensitively. */
	private final Set<String> myExceptions =
			new HashSet<>(Arrays.asList("EAS", "EPS", "LLC", "LLP", "of", "at", "in", "and"));

	/**
	 * Two-element arrays of {previous part, current part}. A list is used rather than a set because
	 * arrays have identity-based {@code equals}/{@code hashCode}, so a hash set could never
	 * de-duplicate them; the collection is only ever iterated.
	 */
	private final List<String[]> myBiGramExceptions = new ArrayList<>();

	/**
	 * Creates the standardizer and registers the extra delimiters, allowed characters and the
	 * apostrophe/{@code s} bi-gram exception used by this class.
	 */
	public TitleStandardizer() {
		super();
		addDelimiters("/", ".", "|", ">", "<", "(", ")", ":", "!");
		addAllowed('(', ')', '@', ':', '!', '|', '>', '<');
		myBiGramExceptions.add(new String[] {"'", "s"});
	}

	/**
	 * Registers each given character with the inherited configuration, both as a single-character
	 * extended-ASCII range and as an allowed non-letter/non-digit character.
	 *
	 * @param theCharacter the characters to register
	 */
	private void addAllowed(char... theCharacter) {
		for (char ch : theCharacter) {
			addAllowedExtendedAsciiRange((int) ch, (int) ch);
			addAllowedNonLetterAndDigitCharacters(ch);
		}
	}

	/**
	 * Standardizes the given text.
	 *
	 * <p>The text is first passed through the inherited {@code replaceTranslates}, then split on
	 * runs of whitespace. Each token is trimmed and run through {@link #standardizeText(String)};
	 * tokens that end up empty are dropped, the remaining ones are checked against the exception
	 * words, and the result is joined with single spaces.
	 *
	 * @param theString the text to standardize
	 * @return the standardized text
	 */
	@Override
	public String standardize(String theString) {
		theString = replaceTranslates(theString);

		return Arrays.stream(theString.split("\\s+"))
				.map(String::trim)
				.map(this::standardizeText)
				.filter(s -> !StringUtils.isEmpty(s))
				.map(this::checkTitleExceptions)
				.collect(Collectors.joining(" "));
	}

	/**
	 * Breaks a token into parts, walking it code point by code point.
	 *
	 * <p>Code points reported by the inherited {@code isNoiseCharacter} are skipped. Every code point
	 * reported by the inherited {@code isDelimiter} becomes a part of its own and terminates the part
	 * being accumulated; all other code points are appended to the current part.
	 *
	 * @param theString the token to split
	 * @return the parts in their original order, delimiters included
	 */
	private List<String> split(String theString) {
		List<String> retVal = new ArrayList<>();
		StringBuilder buf = new StringBuilder();

		int cursor = 0;
		while (cursor < theString.length()) {
			int codePoint = theString.codePointAt(cursor);
			// Advance by the code point's width so supplementary characters are not cut in half.
			cursor += Character.charCount(codePoint);
			if (isNoiseCharacter(codePoint)) {
				continue;
			}

			String str = new String(Character.toChars(codePoint));
			if (isDelimiter(str)) {
				if (buf.length() != 0) {
					retVal.add(buf.toString());
					buf.setLength(0);
				}
				retVal.add(str);
				continue;
			}

			buf.append(str);
		}

		if (buf.length() != 0) {
			retVal.add(buf.toString());
		}

		return retVal;
	}

	/**
	 * Standardizes a single whitespace-free token.
	 *
	 * <p>The token is split into parts; parts made up solely of letters are passed through
	 * {@link #standardizeNameToken(String)}, every part is then checked against the bi-gram
	 * exceptions using the previously emitted part, and the parts are concatenated back together.
	 *
	 * @param theToken the token to standardize
	 * @return the standardized token, possibly empty
	 */
	protected String standardizeText(String theToken) {
		StringBuilder buf = new StringBuilder();

		String prevPart = null;
		for (String part : split(theToken)) {
			if (isAllText(part)) {
				part = standardizeNameToken(part);
			}

			part = checkBiGram(prevPart, part);
			buf.append(part);
			prevPart = part;
		}
		return buf.toString();
	}

	/**
	 * Applies the bi-gram exceptions to a pair of consecutive parts.
	 *
	 * @param thePart0 the previously emitted part, or {@code null} for the first part
	 * @param thePart1 the current part
	 * @return the registered form of the second element when both parts match a bi-gram exception
	 *         case-insensitively; otherwise {@code thePart1} unchanged
	 */
	private String checkBiGram(String thePart0, String thePart1) {
		for (String[] biGram : myBiGramExceptions) {
			if (biGram[0].equalsIgnoreCase(thePart0) && biGram[1].equalsIgnoreCase(thePart1)) {
				return biGram[1];
			}
		}
		return thePart1;
	}

	/**
	 * Tells whether a part consists solely of letters.
	 *
	 * @param thePart the part to inspect
	 * @return {@code false} as soon as a non-letter code point is found; {@code true} otherwise,
	 *         including for an empty string
	 */
	private boolean isAllText(String thePart) {
		for (int offset = 0; offset < thePart.length(); ) {
			int codePoint = thePart.codePointAt(offset);
			if (!Character.isLetter(codePoint)) {
				return false;
			}
			offset += Character.charCount(codePoint);
		}
		return true;
	}

	/**
	 * Standardizes a name token, giving precedence to the exception words.
	 *
	 * @param theToken the token to standardize
	 * @return the registered exception word when the token matches one case-insensitively;
	 *         otherwise the result of the inherited implementation
	 */
	@Override
	protected String standardizeNameToken(String theToken) {
		String exception = findTitleException(theToken);
		if (exception != null) {
			return exception;
		}

		return super.standardizeNameToken(theToken);
	}

	/**
	 * Replaces a whole token with its registered exception word, if there is one.
	 *
	 * @param theString the already standardized token
	 * @return the registered exception word when the token matches one case-insensitively;
	 *         otherwise {@code theString} unchanged
	 */
	private String checkTitleExceptions(String theString) {
		String exception = findTitleException(theString);
		return exception != null ? exception : theString;
	}

	/**
	 * Looks up the exception word matching the given text, ignoring case.
	 *
	 * @param theText the text to look up; a {@code null} value matches nothing
	 * @return the exception word in its registered casing, or {@code null} when none matches
	 */
	private String findTitleException(String theText) {
		for (String exception : myExceptions) {
			if (exception.equalsIgnoreCase(theText)) {
				return exception;
			}
		}
		return null;
	}
}