001package ca.uhn.fhir.util;
002
003/*-
004 * #%L
005 * HAPI FHIR - Core Library
006 * %%
007 * Copyright (C) 2014 - 2021 Smile CDR, Inc.
008 * %%
009 * Licensed under the Apache License, Version 2.0 (the "License");
010 * you may not use this file except in compliance with the License.
011 * You may obtain a copy of the License at
012 *
013 *      http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing, software
016 * distributed under the License is distributed on an "AS IS" BASIS,
017 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018 * See the License for the specific language governing permissions and
019 * limitations under the License.
020 * #L%
021 */
022
023import java.io.CharArrayWriter;
024import java.nio.charset.StandardCharsets;
025import java.text.Normalizer;
026import java.util.Arrays;
027
/**
 * Static helpers for common string manipulations: trimming trailing characters,
 * normalizing text for search indexing, decoding UTF-8 bytes and taking
 * code-point-aware prefixes.
 */
public class StringUtil {

        /** First code unit of the Unicode "Combining Diacritical Marks" block (U+0300). */
        private static final int COMBINING_MARKS_FIRST = 0x0300;

        /** Last code unit of the Unicode "Combining Diacritical Marks" block (U+036F). */
        private static final int COMBINING_MARKS_LAST = 0x036F;

        /** Number of bytes in the UTF-8 byte order mark (EF BB BF). */
        private static final int UTF8_BOM_LENGTH = 3;

        /**
         * If a string ends with a given character, remove that character from the end of the string
         * (as many times as it occurs at the end).
         *
         * @param theInput
         *      String to trim, may be <code>null</code>
         * @param theCharacter
         *      Character to remove from the end of the string
         * @return
         *      The input without any trailing occurrences of the character, or <code>null</code>
         *      if the input was <code>null</code>
         */
        public static String chompCharacter(String theInput, char theCharacter) {
                if (theInput == null) {
                        return null;
                }

                // Find the cut position first so that only a single substring is created
                int end = theInput.length();
                while (end > 0 && theInput.charAt(end - 1) == theCharacter) {
                        end--;
                }
                return theInput.substring(0, end);
        }

        /**
         * Normalizes a string so that it can be stored in (or matched against) a search index:
         * combining diacritical marks are removed and the result is converted to upper case.
         *
         * @param theString
         *      String to normalize, may be <code>null</code>
         * @return
         *      The normalized string, or <code>null</code> if the input was <code>null</code>
         */
        public static String normalizeStringForSearchIndexing(String theString) {
                if (theString == null) {
                        return null;
                }

                /*
                 * The following block of code is used to strip out diacritical marks from latin script
                 * and also convert to upper case. E.g. "james" written with an accented "a"
                 * becomes "JAMES".
                 *
                 * See http://www.unicode.org/charts/PDF/U0300.pdf for the logic
                 * behind stripping 0300-036F
                 *
                 * See #454 for an issue where we were completely stripping non latin characters
                 * See #832 for an issue where we normalize korean characters, which are decomposed
                 */
                String decomposed = Normalizer.normalize(theString, Normalizer.Form.NFD);
                StringBuilder stripped = new StringBuilder(decomposed.length());
                for (int i = 0, n = decomposed.length(); i < n; ++i) {
                        char c = decomposed.charAt(i);
                        boolean combiningMark = c >= COMBINING_MARKS_FIRST && c <= COMBINING_MARKS_LAST;
                        if (!combiningMark) {
                                stripped.append(c);
                        }
                }

                // Use a fixed locale: with the JVM default locale the result would depend on the host
                // configuration (e.g. a Turkish locale upper-cases "i" to a dotted capital I).
                return stripped.toString().toUpperCase(java.util.Locale.ROOT);
        }

        /**
         * Decodes a byte array as UTF-8, skipping a leading UTF-8 byte order mark if one is present.
         *
         * @param theBytes
         *      UTF-8 encoded bytes, may be <code>null</code>
         * @return
         *      The decoded string, or <code>null</code> if the input was <code>null</code>
         */
        public static String toUtf8String(byte[] theBytes) {
                if (theBytes == null) {
                        return null;
                }

                int offset = 0;
                boolean startsWithBom = theBytes.length >= UTF8_BOM_LENGTH
                        && theBytes[0] == (byte) 0xEF
                        && theBytes[1] == (byte) 0xBB
                        && theBytes[2] == (byte) 0xBF;
                if (startsWithBom) {
                        offset = UTF8_BOM_LENGTH;
                }

                // Decode in place from the offset instead of copying the array without the BOM
                return new String(theBytes, offset, theBytes.length - offset, StandardCharsets.UTF_8);
        }

        /**
         * Gets the string prefix of the specified length.
         *
         * @param theString
         *      String to get the prefix from
         * @param theCodePointCount
         *      Length of the prefix in code points
         * @return
         *      Returns the string prefix of the specified number of codepoints. Returns
         *      <code>null</code> for a <code>null</code> string, an empty string for a negative
         *      count, and the whole string if it has no more than the requested number of
         *      code points.
         */
        public static String left(String theString, int theCodePointCount) {
                if (theString == null) {
                        return null;
                }

                if (theCodePointCount < 0) {
                        return "";
                }

                // Cheap check first: char count can only be bigger than the code point count
                int charCount = theString.length();
                if (charCount <= theCodePointCount) {
                        return theString;
                }

                // Surrogate pairs make the code point count smaller than the char count, so the
                // string may still be short enough; offsetByCodePoints would throw in that case.
                if (theString.codePointCount(0, charCount) <= theCodePointCount) {
                        return theString;
                }

                return theString.substring(0, theString.offsetByCodePoints(0, theCodePointCount));
        }

}