001///////////////////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code and other text files for adherence to a set of rules.
003// Copyright (C) 2001-2024 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018///////////////////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.Arrays;
023import java.util.List;
024import java.util.Map;
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027
028import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
029import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
030import com.puppycrawl.tools.checkstyle.api.DetailAST;
031import com.puppycrawl.tools.checkstyle.api.TextBlock;
032import com.puppycrawl.tools.checkstyle.api.TokenTypes;
033import com.puppycrawl.tools.checkstyle.utils.CheckUtil;
034import com.puppycrawl.tools.checkstyle.utils.CodePointUtil;
035
036/**
037 * <div>
038 * Restricts using
039 * <a href = "https://docs.oracle.com/javase/specs/jls/se11/html/jls-3.html#jls-3.3">
040 * Unicode escapes</a>
041 * (such as &#92;u221e). It is possible to allow using escapes for
042 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
043 * non-printable, control characters</a>.
044 * Also, this check can be configured to allow using escapes
045 * if trail comment is present. By the option it is possible to
046 * allow using escapes if literal contains only them.
047 * </div>
048 *
049 * <ul>
050 * <li>
051 * Property {@code allowByTailComment} - Allow use escapes if trail comment is present.
052 * Type is {@code boolean}.
053 * Default value is {@code false}.
054 * </li>
055 * <li>
056 * Property {@code allowEscapesForControlCharacters} - Allow use escapes for
057 * non-printable, control characters.
058 * Type is {@code boolean}.
059 * Default value is {@code false}.
060 * </li>
061 * <li>
062 * Property {@code allowIfAllCharactersEscaped} - Allow if all characters in literal are escaped.
063 * Type is {@code boolean}.
064 * Default value is {@code false}.
065 * </li>
066 * <li>
067 * Property {@code allowNonPrintableEscapes} - Allow use escapes for
068 * non-printable, whitespace characters.
069 * Type is {@code boolean}.
070 * Default value is {@code false}.
071 * </li>
072 * </ul>
073 *
074 * <p>
075 * Parent is {@code com.puppycrawl.tools.checkstyle.TreeWalker}
076 * </p>
077 *
078 * <p>
079 * Violation Message Keys:
080 * </p>
081 * <ul>
082 * <li>
083 * {@code forbid.escaped.unicode.char}
084 * </li>
085 * </ul>
086 *
087 * @since 5.8
088 */
089@FileStatefulCheck
090public class AvoidEscapedUnicodeCharactersCheck
091    extends AbstractCheck {
092
093    /**
094     * A key is pointing to the warning message text in "messages.properties"
095     * file.
096     */
097    public static final String MSG_KEY = "forbid.escaped.unicode.char";
098
099    /** Regular expression for Unicode chars. */
100    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u+[a-fA-F\\d]{4}");
101
102    /**
103     * Regular expression Unicode control characters.
104     *
105     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
106     *     Appendix:Control characters</a>
107     */
108    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\u+"
109            + "(00[0-1][\\dA-Fa-f]"
110            + "|00[8-9][\\dA-Fa-f]"
111            + "|00[aA][dD]"
112            + "|034[fF]"
113            + "|070[fF]"
114            + "|180[eE]"
115            + "|200[b-fB-F]"
116            + "|202[a-eA-E]"
117            + "|206[0-4a-fA-F]"
118            + "|[fF]{3}[9a-bA-B]"
119            + "|[fF][eE][fF]{2})");
120
121    /**
122     * Regular expression for all escaped chars.
123     * See <a href="https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.10.7">
124     * EscapeSequence</a>
125     */
126    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^("
127            + UNICODE_REGEXP.pattern()
128            + "|\""
129            + "|'"
130            + "|\\\\"
131            + "|\\\\b"
132            + "|\\\\f"
133            + "|\\\\n"
134            + "|\\R"
135            + "|\\\\r"
136            + "|\\\\s"
137            + "|\\\\t"
138            + ")+$");
139
140    /** Regular expression for escaped backslash. */
141    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
142
143    /** Regular expression for non-printable unicode chars. */
144    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u0000"
145            + "|\\\\u0009"
146            + "|\\\\u000[bB]"
147            + "|\\\\u000[cC]"
148            + "|\\\\u0020"
149            + "|\\\\u007[fF]"
150            + "|\\\\u0085"
151            + "|\\\\u009[fF]"
152            + "|\\\\u00[aA]0"
153            + "|\\\\u00[aA][dD]"
154            + "|\\\\u04[fF]9"
155            + "|\\\\u05[bB][eE]"
156            + "|\\\\u05[dD]0"
157            + "|\\\\u05[eE][aA]"
158            + "|\\\\u05[fF]3"
159            + "|\\\\u05[fF]4"
160            + "|\\\\u0600"
161            + "|\\\\u0604"
162            + "|\\\\u061[cC]"
163            + "|\\\\u06[dD]{2}"
164            + "|\\\\u06[fF]{2}"
165            + "|\\\\u070[fF]"
166            + "|\\\\u0750"
167            + "|\\\\u077[fF]"
168            + "|\\\\u0[eE]00"
169            + "|\\\\u0[eE]7[fF]"
170            + "|\\\\u1680"
171            + "|\\\\u180[eE]"
172            + "|\\\\u1[eE]00"
173            + "|\\\\u2000"
174            + "|\\\\u2001"
175            + "|\\\\u2002"
176            + "|\\\\u2003"
177            + "|\\\\u2004"
178            + "|\\\\u2005"
179            + "|\\\\u2006"
180            + "|\\\\u2007"
181            + "|\\\\u2008"
182            + "|\\\\u2009"
183            + "|\\\\u200[aA]"
184            + "|\\\\u200[fF]"
185            + "|\\\\u2025"
186            + "|\\\\u2028"
187            + "|\\\\u2029"
188            + "|\\\\u202[fF]"
189            + "|\\\\u205[fF]"
190            + "|\\\\u2064"
191            + "|\\\\u2066"
192            + "|\\\\u2067"
193            + "|\\\\u2068"
194            + "|\\\\u2069"
195            + "|\\\\u206[aA]"
196            + "|\\\\u206[fF]"
197            + "|\\\\u20[aA][fF]"
198            + "|\\\\u2100"
199            + "|\\\\u213[aA]"
200            + "|\\\\u3000"
201            + "|\\\\u[dD]800"
202            + "|\\\\u[fF]8[fF]{2}"
203            + "|\\\\u[fF][bB]50"
204            + "|\\\\u[fF][dD][fF]{2}"
205            + "|\\\\u[fF][eE]70"
206            + "|\\\\u[fF][eE][fF]{2}"
207            + "|\\\\u[fF]{2}0[eE]"
208            + "|\\\\u[fF]{2}61"
209            + "|\\\\u[fF]{2}[dD][cC]"
210            + "|\\\\u[fF]{3}9"
211            + "|\\\\u[fF]{3}[aA]"
212            + "|\\\\u[fF]{3}[bB]"
213            + "|\\\\u[fF]{4}");
214
215    /** Cpp style comments. */
216    private Map<Integer, TextBlock> singlelineComments;
217    /** C style comments. */
218    private Map<Integer, List<TextBlock>> blockComments;
219
220    /** Allow use escapes for non-printable, control characters. */
221    private boolean allowEscapesForControlCharacters;
222
223    /** Allow use escapes if trail comment is present. */
224    private boolean allowByTailComment;
225
226    /** Allow if all characters in literal are escaped. */
227    private boolean allowIfAllCharactersEscaped;
228
229    /** Allow use escapes for non-printable, whitespace characters. */
230    private boolean allowNonPrintableEscapes;
231
232    /**
233     * Setter to allow use escapes for non-printable, control characters.
234     *
235     * @param allow user's value.
236     * @since 5.8
237     */
238    public final void setAllowEscapesForControlCharacters(boolean allow) {
239        allowEscapesForControlCharacters = allow;
240    }
241
242    /**
243     * Setter to allow use escapes if trail comment is present.
244     *
245     * @param allow user's value.
246     * @since 5.8
247     */
248    public final void setAllowByTailComment(boolean allow) {
249        allowByTailComment = allow;
250    }
251
252    /**
253     * Setter to allow if all characters in literal are escaped.
254     *
255     * @param allow user's value.
256     * @since 5.8
257     */
258    public final void setAllowIfAllCharactersEscaped(boolean allow) {
259        allowIfAllCharactersEscaped = allow;
260    }
261
262    /**
263     * Setter to allow use escapes for non-printable, whitespace characters.
264     *
265     * @param allow user's value.
266     * @since 5.8
267     */
268    public final void setAllowNonPrintableEscapes(boolean allow) {
269        allowNonPrintableEscapes = allow;
270    }
271
272    @Override
273    public int[] getDefaultTokens() {
274        return getRequiredTokens();
275    }
276
277    @Override
278    public int[] getAcceptableTokens() {
279        return getRequiredTokens();
280    }
281
282    @Override
283    public int[] getRequiredTokens() {
284        return new int[] {
285            TokenTypes.STRING_LITERAL,
286            TokenTypes.CHAR_LITERAL,
287            TokenTypes.TEXT_BLOCK_CONTENT,
288        };
289    }
290
291    // suppress deprecation until https://github.com/checkstyle/checkstyle/issues/11166
292    @SuppressWarnings("deprecation")
293    @Override
294    public void beginTree(DetailAST rootAST) {
295        singlelineComments = getFileContents().getSingleLineComments();
296        blockComments = getFileContents().getBlockComments();
297    }
298
299    @Override
300    public void visitToken(DetailAST ast) {
301        final String literal =
302            CheckUtil.stripIndentAndInitialNewLineFromTextBlock(ast.getText());
303
304        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
305                || isAllCharactersEscaped(literal)
306                || allowEscapesForControlCharacters
307                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
308                || allowNonPrintableEscapes
309                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
310            log(ast, MSG_KEY);
311        }
312    }
313
314    /**
315     * Checks if literal has Unicode chars.
316     *
317     * @param literal String literal.
318     * @return true if literal has Unicode chars.
319     */
320    private static boolean hasUnicodeChar(String literal) {
321        final String literalWithoutEscapedBackslashes =
322                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
323        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
324    }
325
326    /**
327     * Check if String literal contains Unicode control chars.
328     *
329     * @param literal String literal.
330     * @param pattern RegExp for valid characters.
331     * @return true, if String literal contains Unicode control chars.
332     */
333    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
334        final int unicodeMatchesCounter =
335                countMatches(UNICODE_REGEXP, literal);
336        final int unicodeValidMatchesCounter =
337                countMatches(pattern, literal);
338        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
339    }
340
341    /**
342     * Check if trail comment is present after ast token.
343     *
344     * @param ast current token.
345     * @return true if trail comment is present after ast token.
346     */
347    private boolean hasTrailComment(DetailAST ast) {
348        int lineNo = ast.getLineNo();
349
350        // Since the trailing comment in the case of text blocks must follow the """ delimiter,
351        // we need to look for it after TEXT_BLOCK_LITERAL_END.
352        if (ast.getType() == TokenTypes.TEXT_BLOCK_CONTENT) {
353            lineNo = ast.getNextSibling().getLineNo();
354        }
355        boolean result = false;
356        if (singlelineComments.containsKey(lineNo)) {
357            result = true;
358        }
359        else {
360            final List<TextBlock> commentList = blockComments.get(lineNo);
361            if (commentList != null) {
362                final TextBlock comment = commentList.get(commentList.size() - 1);
363                final int[] codePoints = getLineCodePoints(lineNo - 1);
364                result = isTrailingBlockComment(comment, codePoints);
365            }
366        }
367        return result;
368    }
369
370    /**
371     * Whether the C style comment is trailing.
372     *
373     * @param comment the comment to check.
374     * @param codePoints the first line of the comment, in unicode code points
375     * @return true if the comment is trailing.
376     */
377    private static boolean isTrailingBlockComment(TextBlock comment, int... codePoints) {
378        return comment.getText().length != 1
379            || CodePointUtil.isBlank(Arrays.copyOfRange(codePoints,
380                comment.getEndColNo() + 1, codePoints.length));
381    }
382
383    /**
384     * Count regexp matches into String literal.
385     *
386     * @param pattern pattern.
387     * @param target String literal.
388     * @return count of regexp matches.
389     */
390    private static int countMatches(Pattern pattern, String target) {
391        int matcherCounter = 0;
392        final Matcher matcher = pattern.matcher(target);
393        while (matcher.find()) {
394            matcherCounter++;
395        }
396        return matcherCounter;
397    }
398
399    /**
400     * Checks if all characters in String literal is escaped.
401     *
402     * @param literal current literal.
403     * @return true if all characters in String literal is escaped.
404     */
405    private boolean isAllCharactersEscaped(String literal) {
406        return allowIfAllCharactersEscaped
407                && ALL_ESCAPED_CHARS.matcher(literal).find();
408    }
409
410}