/* * Copyright 2008 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.template.soy.base.internal; import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.base.CharMatcher; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; import com.google.common.hash.Hashing; import java.io.File; import java.util.List; import java.util.Set; import java.util.regex.Pattern; /** * Base utilities for Soy code. * *

Important: Do not use outside of Soy code (treat as superpackage-private). * */ public class BaseUtils { private BaseUtils() {} /** Used by {@code ensureDirsExistInPath()}. Keeps track of known existing directory paths. */ private static final Set KNOWN_EXISTING_DIRS = Sets.newHashSet(); /** Regular expression for an identifier. */ public static final String IDENT_RE = "[a-zA-Z_][a-zA-Z_0-9]*"; /** Pattern for an identifier. */ private static final Pattern IDENT_PATTERN = Pattern.compile(IDENT_RE); /** Pattern for an identifier with leading dot. */ private static final Pattern IDENT_WITH_LEADING_DOT_PATTERN = Pattern.compile("[.]" + IDENT_RE); /** Regular expression for a dotted identifier. */ public static final String DOTTED_IDENT_RE = IDENT_RE + "(?:[.]" + IDENT_RE + ")*"; /** Pattern for a dotted identifier. */ private static final Pattern DOTTED_IDENT_PATTERN = Pattern.compile(DOTTED_IDENT_RE); /** Regular expression for an identifier that also allows dashes. */ private static final String DASHED_IDENT_RE = IDENT_RE + "(?:[-][a-zA-Z_0-9]*)*"; /** Pattern for an identifier that allows either dots or dashes, but not both. */ private static final Pattern DOTTED_OR_DASHED_IDENT_PATTERN = Pattern.compile(String.format("(?:%s)|(?:%s)", DOTTED_IDENT_RE, DASHED_IDENT_RE)); /** Pattern for a leading or trailing underscore. */ private static final Pattern LEADING_OR_TRAILING_UNDERSCORE_PATTERN = Pattern.compile("^_+|_+\\Z"); /** Pattern for places to insert underscores to make an identifier name underscore-separated. */ private static final Pattern WORD_BOUNDARY_IN_IDENT_PATTERN = Pattern.compile( "(?<= [a-zA-Z])(?= [A-Z][a-z])" + // _ "| (?<= [a-zA-Z])(?= [0-9])" + // _ "| (?<= [0-9])(?= [a-zA-Z])", // _ Pattern.COMMENTS); /** Pattern for consecutive underscores. */ private static final Pattern CONSECUTIVE_UNDERSCORES_PATTERN = Pattern.compile("_ _ _*", Pattern.COMMENTS); /** Hex digits for Soy strings (requires upper-case hex digits). */ private static final char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; /** * Ensures that the directories in the given path exist, creating them if necessary. * * Note: If the path does not end with the separator char (slash in Linux), then the name at the * end is assumed to be the file name, so directories are only created down to its parent. * * @param path The path for which to ensure directories exist. */ public static void ensureDirsExistInPath(String path) { if (path == null || path.length() == 0) { throw new AssertionError( "ensureDirsExistInPath called with null or empty path."); } String dirPath = (path.charAt(path.length() - 1) == File.separatorChar) ? path.substring(0, path.length() - 1) : (new File(path)).getParent(); if (dirPath == null || KNOWN_EXISTING_DIRS.contains(dirPath)) { return; // known to exist } else { (new File(dirPath)).mkdirs(); KNOWN_EXISTING_DIRS.add(dirPath); } } /** * Determines whether the given string is an identifier. * *

An identifier must start with a letter or underscore and must only contain letters, digits, * and underscores (i.e. it must match the regular expression {@code [A-Za-z_][A-Za-z_0-9]*}). * * @param s The string to check. * @return True if the given string is an identifier. */ public static boolean isIdentifier(String s) { return IDENT_PATTERN.matcher(s).matches(); } /** * Determines whether the given string is a dot followed by an identifier. * * @param s The string to check. * @return True if the given string is a dot followed by an identifier. */ public static boolean isIdentifierWithLeadingDot(String s) { return IDENT_WITH_LEADING_DOT_PATTERN.matcher(s).matches(); } /** * Determines whether the given string is a "dotted or dashed" identifier. * This allows dots, or dashes, but not both. * * @param s The string to check. * @return True if the given string is a dotted-or-dashed identifier. */ public static boolean isDottedOrDashedIdent(String s) { return DOTTED_OR_DASHED_IDENT_PATTERN.matcher(s).matches(); } /** * Determines whether the given string is a dotted identifier (e.g. {@code boo.foo0._goo}). A * dotted identifier is not required to have dots (i.e. a simple identifier qualifies as a dotted * identifier). * * @param s The string to check. * @return True if the given string is a dotted identifier (e.g. {@code boo.foo0._goo}). */ public static boolean isDottedIdentifier(String s) { return DOTTED_IDENT_PATTERN.matcher(s).matches(); } /** * Gets the part after the last dot in a dotted identifier. If there are no dots, returns the * whole input string. *

Important: The input must be a dotted identifier. This is not checked. */ public static String extractPartAfterLastDot(String dottedIdent) { int lastDotIndex = dottedIdent.lastIndexOf('.'); return (lastDotIndex == -1) ? dottedIdent : dottedIdent.substring(lastDotIndex + 1); } /** * Converts an identifier to upper-underscore format. The identifier must start with a letter or * underscore and must only contain letters, digits, and underscores (i.e. it must match the * regular expression {@code [A-Za-z_][A-Za-z_0-9]*}). * * @param ident The identifer to convert. * @return The identifier in upper-underscore format. */ public static String convertToUpperUnderscore(String ident) { ident = LEADING_OR_TRAILING_UNDERSCORE_PATTERN.matcher(ident).replaceAll(""); ident = WORD_BOUNDARY_IN_IDENT_PATTERN.matcher(ident).replaceAll("_"); ident = CONSECUTIVE_UNDERSCORES_PATTERN.matcher(ident).replaceAll("_"); return ident.toUpperCase(); } /** * Builds a Soy string literal for this string value (including the surrounding single quotes). * Note that Soy string syntax is a subset of JS string syntax, so the result should also be a * valid JS string. * * Adapted from StringUtil.javaScriptEscape(). * * @param value The string value to escape. * @param shouldEscapeToAscii Whether to escape non-ASCII characters as Unicode hex escapes * (backslash + 'u' + 4 hex digits). * @return A Soy string literal for this string value (including the surrounding single quotes). */ public static String escapeToSoyString(String value, boolean shouldEscapeToAscii) { // StringUtil.javaScriptEscape() is meant to be compatible with JS string syntax, which is a // superset of the Soy expression string syntax, so we can't depend on it to properly escape a // Soy expression string literal. For example, they switched the default character escaping // to octal to save a few bytes, but octal escapes are not allowed in Soy syntax. I'm rewriting // the code here in a correct way for Soy. int len = value.length(); StringBuilder out = new StringBuilder(len * 9 / 8); out.append('\''); int codePoint; for (int i = 0; i < len; i += Character.charCount(codePoint)) { codePoint = value.codePointAt(i); switch (codePoint) { case '\n': out.append("\\n"); break; case '\r': out.append("\\r"); break; case '\t': out.append("\\t"); break; case '\b': out.append("\\b"); break; case '\f': out.append("\\f"); break; case '\\': out.append("\\\\"); break; case '\'': out.append("\\'"); break; case '"' : out.append('"'); break; // note: don't escape double quotes in Soy strings default: // If shouldEscapeToAscii, then hex escape characters outside the range 0x20 to 0x7F. if (shouldEscapeToAscii && (codePoint < 0x20 || codePoint >= 0x7F)) { appendHexEscape(out, codePoint); } else { out.appendCodePoint(codePoint); } break; } } out.append('\''); return out.toString(); } /** * Appends the Unicode hex escape sequence for the given code point (backslash + 'u' + 4 hex * digits) to the given StringBuilder. * * Note: May append 2 escape sequences (surrogate pair) in the case of a supplementary character * (outside the Unicode BMP). * * Adapted from StringUtil.appendHexJavaScriptRepresentation(). * * @param out The StringBuilder to append to. * @param codePoint The Unicode code point whose hex escape sequence to append. */ public static void appendHexEscape(StringBuilder out, int codePoint) { if (Character.isSupplementaryCodePoint(codePoint)) { // Handle supplementary unicode values which are not representable in // javascript. We deal with these by escaping them as two 4B sequences // so that they will round-trip properly when sent from java to javascript // and back. char[] surrogates = Character.toChars(codePoint); appendHexEscape(out, surrogates[0]); appendHexEscape(out, surrogates[1]); } else { out.append("\\u") .append(HEX_DIGITS[(codePoint >>> 12) & 0xF]) .append(HEX_DIGITS[(codePoint >>> 8) & 0xF]) .append(HEX_DIGITS[(codePoint >>> 4) & 0xF]) .append(HEX_DIGITS[ codePoint & 0xF]); } } /** * Computes the SHA-1 hash value of the input string's UTF-8 representation and returns the first * numBits bits of the result as a hex value in string form. * * @param strToHash The string to compute SHA-1 of. * @param numBits The number of bits worth to return. Must be a positive number at most 160 and * divisible by 8 (since we process the result 8 bits at a time). * @return The partial SHA-1 hash value as a hex string. */ public static String computePartialSha1AsHexString(String strToHash, int numBits) { Preconditions.checkArgument(numBits > 0 && numBits <= 160 && numBits % 8 == 0); int numBytes = numBits / 8; return Hashing.sha1().hashString(strToHash, UTF_8) .toString().substring(0, numBytes * 2); } private static final CharMatcher whitespaceOrComma = CharMatcher.whitespace().or(CharMatcher.is(',')).precomputed(); /** * A helper method for formating javacc ParseExceptions. * @param errorToken The piece of text that we were unable to parse. * @param expectedTokens The set of formatted tokens that we were expecting next. */ public static String formatParseExceptionDetails(String errorToken, List expectedTokens) { String details; int numExpectedTokens = expectedTokens.size(); if (numExpectedTokens != 0) { StringBuilder builder = new StringBuilder(": expected "); for (int i = 0; i < numExpectedTokens; i++) { builder.append(maybeQuoteForParseError(expectedTokens.get(i))); if (i != numExpectedTokens - 1) { builder.append(", "); } if (i == numExpectedTokens - 2) { builder.append("or "); } } details = builder.toString(); } else { details = ""; } return String.format("parse error at '%s'%s", errorToken, details); } private static String maybeQuoteForParseError(String token) { // the literal matches are surrounded in double quotes, remove them, unless the token starts // or ends with a whitespace character or contains a comma if (token.charAt(0) == '"' && token.charAt(token.length() - 1) == '"') { token = token.substring(1, token.length() - 1); } if (whitespaceOrComma.matchesAnyOf(token)) { token = "'" + token + "'"; } return token; } }