001/*- 002 * #%L 003 * HAPI FHIR - Server Framework 004 * %% 005 * Copyright (C) 2014 - 2024 Smile CDR, Inc. 006 * %% 007 * Licensed under the Apache License, Version 2.0 (the "License"); 008 * you may not use this file except in compliance with the License. 009 * You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 * #L% 019 */ 020package ca.uhn.fhir.rest.server.interceptor.s13n.standardizers; 021 022import ca.uhn.fhir.i18n.Msg; 023import ca.uhn.fhir.rest.server.interceptor.ConfigLoader; 024 025import java.util.HashSet; 026import java.util.Scanner; 027import java.util.Set; 028 029public class NoiseCharacters { 030 031 private static final int RANGE_THRESHOLD = 150; 032 033 private Set<Integer> myNoiseCharacters = new HashSet<>(); 034 private Set<Range> myNoiseCharacterRanges = new HashSet<>(); 035 036 private int size; 037 038 public int getSize() { 039 return myNoiseCharacters.size(); 040 } 041 042 public void initializeFromClasspath() { 043 String noiseChars = ConfigLoader.loadResourceContent("classpath:noise-chars.txt"); 044 try (Scanner scanner = new Scanner(noiseChars)) { 045 while (scanner.hasNext()) { 046 parse(scanner.nextLine()); 047 } 048 } 049 } 050 051 public boolean isNoise(int theChar) { 052 if (myNoiseCharacters.contains(theChar)) { 053 return true; 054 } 055 056 for (Range r : myNoiseCharacterRanges) { 057 if (r.isInRange(theChar)) { 058 return true; 059 } 060 } 061 062 return false; 063 } 064 065 private void parse(String theString) { 066 if (theString.contains("-")) { 067 addRange(theString); 068 } else { 069 add(theString); 070 } 071 } 072 073 public NoiseCharacters add(String theLiteral) { 074 myNoiseCharacters.add(toInt(theLiteral)); 075 return this; 076 } 077 078 public NoiseCharacters addRange(String theRange) { 079 if (!theRange.contains("-")) { 080 throw new IllegalArgumentException(Msg.code(350) + String.format("Invalid range %s", theRange)); 081 } 082 083 String[] range = theRange.split("-"); 084 if (range.length < 2) { 085 throw new IllegalArgumentException(Msg.code(351) + String.format("Invalid range %s", theRange)); 086 } 087 088 addRange(range[0].trim(), range[1].trim()); 089 return this; 090 } 091 092 public NoiseCharacters addRange(String theLowerBound, String theUpperBound) { 093 int lower = toInt(theLowerBound); 094 int upper = toInt(theUpperBound); 095 096 if (lower > upper) { 097 throw new IllegalArgumentException( 098 Msg.code(352) + String.format("Invalid character range %s-%s", theLowerBound, theUpperBound)); 099 } 100 101 if (upper - lower >= RANGE_THRESHOLD) { 102 myNoiseCharacterRanges.add(new Range(lower, upper)); 103 return this; 104 } 105 106 for (int i = lower; i <= upper; i++) { 107 myNoiseCharacters.add(i); 108 } 109 return this; 110 } 111 112 private int toInt(String theLiteral) { 113 if (!theLiteral.startsWith("#x")) { 114 throw new IllegalArgumentException(Msg.code(353) + "Unable to parse " + theLiteral); 115 } 116 117 return Integer.parseInt(theLiteral.substring(2), 16); 118 } 119}