// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // #include "getonescriptspan.h" #include #include "fixunicodevalue.h" #include "lang_script.h" #include "port.h" #include "utf8statetable.h" #include "utf8prop_lettermarkscriptnum.h" #include "utf8repl_lettermarklower.h" #include "utf8scannot_lettermarkspecial.h" namespace CLD2 { // Alphabetical order for binary search, from // generated_entities.cc extern const int kNameToEntitySize; extern const CharIntPair kNameToEntity[]; static const int kMaxUpToWordBoundary = 50; // span < this make longer, // else make shorter static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes // to round to word boundary, // direction above static const char kSpecialSymbol[256] = { // true for < > & 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, }; #define LT 0 // < #define GT 1 // > #define EX 2 // ! #define HY 3 // - #define QU 4 // " #define AP 5 // ' #define SL 6 // / #define S_ 7 #define C_ 8 #define R_ 9 #define I_ 10 #define P_ 11 #define T_ 12 #define Y_ 13 #define L_ 14 #define E_ 15 #define CR 16 // or #define NL 17 // non-letter: ASCII whitespace, digit, punctuation #define PL 18 // possible letter, incl. & #define xx 19 // // Map byte to one of ~20 interesting categories for cheap tag parsing static const uint8 kCharToSub[256] = { NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, }; #undef LT #undef GT #undef EX #undef HY #undef QU #undef AP #undef SL #undef S_ #undef C_ #undef R_ #undef I_ #undef P_ #undef T_ #undef Y_ #undef L_ #undef E_ #undef CR #undef NL #undef PL #undef xx #define OK 0 #define X_ 1 static const int kMaxExitStateLettersMarksOnly = 1; static const int kMaxExitStateAllText = 2; // State machine to do cheap parse of non-letter strings incl. tags // advances // | | // advances ... for