///////////////////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
// Copyright (C) 2001-2024 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
///////////////////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;

/**
 * <div>
 * Restricts using
 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
 * Unicode escapes</a>
 * (such as \u221e). It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable, control characters</a>.
 * Also, this check can be configured to allow using escapes
 * if trail comment is present. By the option it is possible to
 * allow using escapes if literal contains only them.
 * </div>
 *
 * <ul>
 * <li>
 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
 * non-printable, control characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * <li>
 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
 * non-printable, whitespace characters.
 * Type is {@code boolean}.
 * Default value is {@code false}.
 * </li>
 * </ul>
 *
 * <p>
 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
 * </p>
 *
 * <p>
 * Violation Message Keys:
 * </p>
 * <ul>
 * <li>
 * {@code forbid.escaped.unicode.char}
 * </li>
 * </ul>
 *
 * @since 5.8
 */
@FileStatefulCheck
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /**
     * Regular expression for Unicode chars: a backslash, one or more {@code u}
     * letters and exactly four hexadecimal digits.
     */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
            + "(00[0-1][\\dA-Fa-f]"
            + "|00[8-9][\\dA-Fa-f]"
            + "|00[aA][dD]"
            + "|034[fF]"
            + "|070[fF]"
            + "|180[eE]"
            + "|200[b-fB-F]"
            + "|202[a-eA-E]"
            + "|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]"
            + "|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars.
     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
     * EscapeSequence</a>
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
            + UNICODE_REGEXP.pattern()
            + "|\""
            + "|'"
            + "|\\\\"
            + "|\\\\b"
            + "|\\\\f"
            + "|\\\\n"
            + "|\\R"
            + "|\\\\r"
            + "|\\\\s"
            + "|\\\\t"
            + ")+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /**
     * Regular expression for non-printable unicode chars.
     * The escape prefix accepts one or more {@code u} letters so that it recognises
     * exactly the same escape spellings as {@link #UNICODE_REGEXP} and
     * {@link #UNICODE_CONTROL}; the alternatives are the four-digit code values.
     */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u+(?:"
            + "0000"
            + "|0009"
            + "|000[bB]"
            + "|000[cC]"
            + "|0020"
            + "|007[fF]"
            + "|0085"
            + "|009[fF]"
            + "|00[aA]0"
            + "|00[aA][dD]"
            + "|04[fF]9"
            + "|05[bB][eE]"
            + "|05[dD]0"
            + "|05[eE][aA]"
            + "|05[fF]3"
            + "|05[fF]4"
            + "|0600"
            + "|0604"
            + "|061[cC]"
            + "|06[dD]{2}"
            + "|06[fF]{2}"
            + "|070[fF]"
            + "|0750"
            + "|077[fF]"
            + "|0[eE]00"
            + "|0[eE]7[fF]"
            + "|1680"
            + "|180[eE]"
            + "|1[eE]00"
            + "|2000"
            + "|2001"
            + "|2002"
            + "|2003"
            + "|2004"
            + "|2005"
            + "|2006"
            + "|2007"
            + "|2008"
            + "|2009"
            + "|200[aA]"
            + "|200[fF]"
            + "|2025"
            + "|2028"
            + "|2029"
            + "|202[fF]"
            + "|205[fF]"
            + "|2064"
            + "|2066"
            + "|2067"
            + "|2068"
            + "|2069"
            + "|206[aA]"
            + "|206[fF]"
            + "|20[aA][fF]"
            + "|2100"
            + "|213[aA]"
            + "|3000"
            + "|[dD]800"
            + "|[fF]8[fF]{2}"
            + "|[fF][bB]50"
            + "|[fF][dD][fF]{2}"
            + "|[fF][eE]70"
            + "|[fF][eE][fF]{2}"
            + "|[fF]{2}0[eE]"
            + "|[fF]{2}61"
            + "|[fF]{2}[dD][cC]"
            + "|[fF]{3}9"
            + "|[fF]{3}[aA]"
            + "|[fF]{3}[bB]"
            + "|[fF]{4}"
            + ")");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable, control characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if trail comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow use escapes for non-printable, whitespace characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Setter to allow use escapes for non-printable, control characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Setter to allow use escapes if trail comment is present.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Setter to allow if all characters in literal are escaped.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Setter to allow use escapes for non-printable, whitespace characters.
     *
     * @param allow user's value.
     * @since 5.8
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return getRequiredTokens();
    }

    @Override
    public int[] getRequiredTokens() {
        return new int[] {
            TokenTypes.STRING_LITERAL,
            TokenTypes.CHAR_LITERAL,
            TokenTypes.TEXT_BLOCK_CONTENT,
        };
    }

    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
    @SuppressWarnings("deprecation")
    @Override
    public void beginTree(DetailAST rootAST) {
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal =
            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());

        if (hasUnicodeChar(literal) && !isEscapeUsageAllowed(ast, literal)) {
            log(ast, MSG_KEY);
        }
    }

    /**
     * Checks whether any of the enabled options permits the Unicode escapes
     * found in the literal. The options are evaluated in the order: trailing
     * comment, all characters escaped, control characters, non-printable characters.
     *
     * @param ast token that holds the literal.
     * @param literal text of the literal.
     * @return true if at least one enabled option permits the escapes.
     */
    private boolean isEscapeUsageAllowed(DetailAST ast, String literal) {
        return allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS);
    }

    /**
     * Removes every escaped backslash (a pair of backslashes) from the literal.
     * A backslash that belongs to such a pair cannot start a Unicode escape, so
     * the pairs must be dropped before searching for escapes.
     *
     * @param literal String literal.
     * @return the literal without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Checks if literal has Unicode chars.
     *
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Checks that every Unicode escape in the literal is matched by the given
     * pattern. Escaped backslashes are removed first, exactly as in
     * {@link #hasUnicodeChar(String)}, so that text following a doubled backslash
     * is not miscounted as an escape.
     *
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the number of Unicode escapes equals the number of
     *     matches of {@code pattern}.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String text = removeEscapedBackslashes(literal);
        return countMatches(UNICODE_REGEXP, text) == countMatches(pattern, text);
    }

    /**
     * Check if trail comment is present after ast token.
     *
     * @param ast current token.
     * @return true if trail comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        final int lineNo;
        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
            // Since the trailing comment in the case of text blocks must follow the
            // """ delimiter, we need to look for it after TEXT_BLOCK_LITERAL_END.
            lineNo = ast.getNextSibling().getLineNo();
        }
        else {
            lineNo = ast.getLineNo();
        }

        boolean result = singlelineComments.containsKey(lineNo);
        if (!result) {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // only the last block comment on the line can be trailing
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final int[] codePoints = getLineCodePoints(lineNo - 1);
                result = isTrailingBlockComment(comment, codePoints);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing. A comment spanning several lines
     * is always treated as trailing; a one-line comment is trailing only when
     * nothing but blanks follows it on the line.
     *
     * @param comment the comment to check.
     * @param codePoints the first line of the comment, in unicode code points
     * @return true if the comment is trailing.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
        return comment.getText().length != 1
            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
                comment.getEndColNo() + 1, codePoints.length));
    }

    /**
     * Count regexp matches into String literal.
     *
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped. Always false when
     * the {@code allowIfAllCharactersEscaped} option is disabled.
     *
     * @param literal current literal.
     * @return true if the option is enabled and all characters in String literal
     *     are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal).find();
    }

}