001/*-
002 * #%L
003 * HAPI FHIR - Server Framework
004 * %%
005 * Copyright (C) 2014 - 2024 Smile CDR, Inc.
006 * %%
007 * Licensed under the Apache License, Version 2.0 (the "License");
008 * you may not use this file except in compliance with the License.
009 * You may obtain a copy of the License at
010 *
011 *      http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 * #L%
019 */
020package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers;
021
022import org.apache.commons.lang3.StringUtils;
023
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.HashSet;
027import java.util.List;
028import java.util.Set;
029import java.util.stream.Collectors;
030
031/**
032 *
033 */
034public class TitleStandardizer extends LastNameStandardizer {
035
036        private Set<String> myExceptions =
037                        new HashSet<>(Arrays.asList("EAS", "EPS", "LLC", "LLP", "of", "at", "in", "and"));
038        private Set<String[]> myBiGramExceptions = new HashSet<String[]>();
039
040        public TitleStandardizer() {
041                super();
042                addDelimiters("/", ".", "|", ">", "<", "(", ")", ":", "!");
043                addAllowed('(', ')', '@', ':', '!', '|', '>', '<');
044                myBiGramExceptions.add(new String[] {"'", "s"});
045        }
046
047        private void addAllowed(char... theCharacter) {
048                for (char ch : theCharacter) {
049                        addAllowedExtendedAsciiRange((int) ch, (int) ch);
050                        addAllowedNonLetterAndDigitCharacters(ch);
051                }
052        }
053
054        @Override
055        public String standardize(String theString) {
056                theString = replaceTranslates(theString);
057
058                return Arrays.stream(theString.split("\\s+"))
059                                .map(String::trim)
060                                .map(this::standardizeText)
061                                .filter(s -> !StringUtils.isEmpty(s))
062                                .map(this::checkTitleExceptions)
063                                .collect(Collectors.joining(" "));
064        }
065
066        private List<String> split(String theString) {
067                int cursor = 0;
068                int start = 0;
069
070                List<String> retVal = new ArrayList<>();
071                StringBuilder buf = new StringBuilder();
072
073                while (cursor < theString.length()) {
074                        int codePoint = theString.codePointAt(cursor);
075                        cursor += Character.charCount(codePoint);
076                        if (isNoiseCharacter(codePoint)) {
077                                continue;
078                        }
079
080                        String str = new String(Character.toChars(codePoint));
081                        if (isDelimiter(str)) {
082                                if (buf.length() != 0) {
083                                        retVal.add(buf.toString());
084                                        buf.setLength(0);
085                                }
086                                retVal.add(str);
087                                continue;
088                        }
089
090                        buf.append(str);
091                }
092
093                if (buf.length() != 0) {
094                        retVal.add(buf.toString());
095                }
096
097                return retVal;
098        }
099
100        protected String standardizeText(String theToken) {
101                StringBuilder buf = new StringBuilder();
102                List<String> parts = split(theToken);
103
104                String prevPart = null;
105                for (String part : parts) {
106                        if (isAllText(part)) {
107                                part = standardizeNameToken(part);
108                        }
109
110                        part = checkBiGram(prevPart, part);
111                        buf.append(part);
112                        prevPart = part;
113                }
114                return buf.toString();
115        }
116
117        private String checkBiGram(String thePart0, String thePart1) {
118                for (String[] biGram : myBiGramExceptions) {
119                        if (biGram[0].equalsIgnoreCase(thePart0) && biGram[1].equalsIgnoreCase(thePart1)) {
120                                return biGram[1];
121                        }
122                }
123                return thePart1;
124        }
125
126        private boolean isAllText(String thePart) {
127                for (int offset = 0; offset < thePart.length(); ) {
128                        int codePoint = thePart.codePointAt(offset);
129                        if (!Character.isLetter(codePoint)) {
130                                return false;
131                        }
132                        offset += Character.charCount(codePoint);
133                }
134                return true;
135        }
136
137        @Override
138        protected String standardizeNameToken(String theToken) {
139                String exception = myExceptions.stream()
140                                .filter(s -> s.equalsIgnoreCase(theToken))
141                                .findFirst()
142                                .orElse(null);
143                if (exception != null) {
144                        return exception;
145                }
146
147                return super.standardizeNameToken(theToken);
148        }
149
150        private String checkTitleExceptions(String theString) {
151                return myExceptions.stream()
152                                .filter(s -> s.equalsIgnoreCase(theString))
153                                .findFirst()
154                                .orElse(theString);
155        }
156}