// Copyright 2013 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Author: dsites@google.com (Dick Sites) // Updated 2014.01 for dual table lookup // #include #include #include #include #include #include "cldutil.h" #include "debug.h" #include "integral_types.h" #include "lang_script.h" #include "utf8acceptinterchange.h" #include "utf8statetable.h" #ifdef CLD2_DYNAMIC_MODE #include "cld2_dynamic_data.h" #include "cld2_dynamic_data_loader.h" #endif #include "cld2tablesummary.h" #include "compact_lang_det_impl.h" #include "compact_lang_det_hint_code.h" #include "getonescriptspan.h" #include "tote.h" namespace CLD2 { using namespace std; // Linker supplies the right tables, From files // cld_generated_cjk_uni_prop_80.cc cld2_generated_cjk_compatible.cc // cld_generated_cjk_delta_bi_32.cc generated_distinct_bi_0.cc // cld2_generated_quad*.cc cld2_generated_deltaocta*.cc // cld2_generated_distinctocta*.cc // cld_generated_score_quad_octa_1024_256.cc // 2014.01 Now implementing quadgram dual lookup tables, to allow main table // sizes that are 1/3/5 times a power of two, instead of just powers of two. // Gives more flexibility of total footprint for CLD2. 
extern const int kLanguageToPLangSize; extern const int kCloseSetSize; extern const UTF8PropObj cld_generated_CjkUni_obj; extern const CLD2TableSummary kCjkCompat_obj; extern const CLD2TableSummary kCjkDeltaBi_obj; extern const CLD2TableSummary kDistinctBiTable_obj; extern const CLD2TableSummary kQuad_obj; extern const CLD2TableSummary kQuad_obj2; // Dual lookup tables extern const CLD2TableSummary kDeltaOcta_obj; extern const CLD2TableSummary kDistinctOcta_obj; extern const short kAvgDeltaOctaScore[]; // Returns the length in bytes of the prefix of src that is all // interchange valid UTF-8 int SpanInterchangeValid(const char* src, int byte_length) { int bytes_consumed; const UTF8ReplaceObj* st = &utf8acceptinterchange_obj; StringPiece str(src, byte_length); UTF8GenericScan(st, str, &bytes_consumed); return bytes_consumed; } #ifdef CLD2_DYNAMIC_MODE // CLD2_DYNAMIC_MODE is defined: // Data will be read from an mmap opened at runtime. // Convenience for nulling things out completely at any point. 
static ScoringTables NULL_TABLES = { NULL, //&cld_generated_CjkUni_obj, NULL, //&kCjkCompat_obj, NULL, //&kCjkDeltaBi_obj, NULL, //&kDistinctBiTable_obj, NULL, //&kQuad_obj, NULL, //&kQuad_obj2, NULL, //&kDeltaOcta_obj, NULL, //&kDistinctOcta_obj, NULL, //kAvgDeltaOctaScore, }; static ScoringTables kScoringtables = NULL_TABLES; // copy constructed static bool dynamicDataLoaded = false; static bool dataSourceIsFile = false; static ScoringTables* dynamicTables = NULL; static void* mmapAddress = NULL; static uint32_t mmapLength = 0; bool isDataLoaded() { return dynamicDataLoaded; } bool isDataDynamic() { return true; } // Because CLD2_DYNAMIC_MODE is defined void loadDataFromFile(const char* fileName) { if (isDataLoaded()) { unloadData(); } ScoringTables* result = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength); if (result == NULL) { fprintf(stderr, "WARNING: Dynamic data loading failed.\n"); return; } dynamicTables = result; kScoringtables = *dynamicTables; dataSourceIsFile = true; dynamicDataLoaded = true; }; void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) { if (isDataLoaded()) { unloadData(); } ScoringTables* result = CLD2DynamicDataLoader::loadDataRaw(rawAddress, length); if (result == NULL) { fprintf(stderr, "WARNING: Dynamic data loading failed.\n"); return; } dynamicTables = result; kScoringtables = *dynamicTables; dataSourceIsFile = false; dynamicDataLoaded = true; } void unloadData() { if (!dynamicDataLoaded) return; if (dataSourceIsFile) { CLD2DynamicDataLoader::unloadDataFile(&dynamicTables, &mmapAddress, &mmapLength); } else { CLD2DynamicDataLoader::unloadDataRaw(&dynamicTables); } dynamicDataLoaded = false; dataSourceIsFile = false; // vacuous kScoringtables = NULL_TABLES; // Housekeeping: null all pointers } #else // !CLD2_DYNAMIC_MODE // This initializes kScoringtables.quadgram_obj etc. 
static const ScoringTables kScoringtables = { &cld_generated_CjkUni_obj, &kCjkCompat_obj, &kCjkDeltaBi_obj, &kDistinctBiTable_obj, &kQuad_obj, &kQuad_obj2, // Dual lookup tables &kDeltaOcta_obj, &kDistinctOcta_obj, kAvgDeltaOctaScore, }; // Method implementations below are provided so that callers aren't *forced* // to depend upon the CLD2_DYNAMIC_MODE flag, but can use runtime checks // instead. For more information, refer to CLD2 issue 16: // https://code.google.com/p/cld2/issues/detail?id=16 bool isDataLoaded() { return true; } // Data is statically linked bool isDataDynamic() { return false; } // Because CLD2_DYNAMIC_MODE is not defined void loadDataFromFile(const char* fileName) { // This is a bug in the calling code. fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromFile has no effect!\n"); } void loadDataFromRawAddress(const void* rawAddress, const uint32_t length) { // This is a bug in the calling code. fprintf(stderr, "WARNING: Dynamic mode not active, loadDataFromRawAddress has no effect!\n"); } void unloadData() { // This is a bug in the calling code. 
fprintf(stderr, "WARNING: Dynamic mode not active, unloadData has no effect!\n"); } #endif // #ifdef CLD2_DYNAMIC_MODE static const bool FLAGS_cld_no_minimum_bytes = false; static const bool FLAGS_cld_forcewords = true; static const bool FLAGS_cld_showme = false; static const bool FLAGS_cld_echotext = true; static const int32 FLAGS_cld_textlimit = 160; static const int32 FLAGS_cld_smoothwidth = 20; static const bool FLAGS_cld_2011_hints = true; static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8; static const bool FLAGS_dbgscore = false; static const int kLangHintInitial = 12; // Boost language by N initially static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram static const int kShortSpanThresh = 32; // Bytes static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing // after this many text bytes static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted static const int kMaxSpaceScan = 32; // Bytes static const int kGoodLang1Percent = 70; static const int kGoodLang1and2Percent = 93; static const int kShortTextThresh = 256; // Bytes static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads static const int kDefaultWordSpan = 256; // Scan at least this many initial // bytes with word scoring static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable static const int kPredictionTableSize 
= 4096; // Must be exactly 4096 for // cheap compressor static const int kNonEnBoilerplateMinPercent = 17; // no second static const int kNonFIGSBoilerplateMinPercent = 20; // no second static const int kGoodFirstMinPercent = 26; // UNK static const int kGoodFirstReliableMinPercent = 51; // unreli static const int kIgnoreMaxPercent = 20; // >this => unreli static const int kKeepMinPercent = 2; // unreli // Statistically closest language, based on quadgram table // Those that are far from other languges map to UNKNOWN_LANGUAGE // Subscripted by Language // // From lang_correlation.txt and hand-edits // sed 's/^$[^ ]*$ $[^ ]*$ coef=0\.$..$.*$/ // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt // static const int kMinCorrPercent = 24; // Pick off how close you want // 24 catches PERSIAN <== ARABIC // but not SPANISH <== PORTUGESE static Language Unknown = UNKNOWN_LANGUAGE; // Suspect idea // Subscripted by Language static const Language kClosestAltLanguage[] = { (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN (28 >= kMinCorrPercent) ? 
GALICIAN : UNKNOWN_LANGUAGE, // SPANISH (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // INDONESIAN ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI ( 0 >= kMinCorrPercent) ? 
Unknown : UNKNOWN_LANGUAGE, // TELUGU ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE, // SLOVENIAN (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK ( 7 >= kMinCorrPercent) ? 
TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // SOMALI ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH ( 2 >= kMinCorrPercent) ? 
SCOTS : UNKNOWN_LANGUAGE, // BURMESE ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE, // TATAR (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR ( 5 >= kMinCorrPercent) ? 
QUECHUA : UNKNOWN_LANGUAGE, // AYMARA (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // SAMOAN ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // AKAN ( 0 >= kMinCorrPercent) ? 
Unknown : UNKNOWN_LANGUAGE, // IGBO ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MAURITIAN_CREOLE ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // HAWAIIAN }; // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, // kClosestAltLanguage_has_incorrect_size); inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} inline bool FlagBestEffort(int flags) { return (flags & kCLDFlagBestEffort) != 0; } // Defines Top40 packed languages // Google top 40 languages // // Tier 0/1 Language enum list (16) // ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH, // E - FIGS // DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN, // PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI, // ARABIC, // // Tier 2 Language enum list (22) // SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN, // HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN, // VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK, // TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN, // UKRAINIAN, HINDI, // // use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21) // // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) { // REVISIT } void PrintText(FILE* f, Language cur_lang, const string& temp) { if (temp.size() == 0) {return;} fprintf(f, "PrintText[%s]%s
\n", LanguageName(cur_lang), temp.c_str()); } //------------------------------------------------------------------------------ // For --cld_html debugging output. Not thread safe //------------------------------------------------------------------------------ static Language prior_lang = UNKNOWN_LANGUAGE; static bool prior_unreliable = false; //------------------------------------------------------------------------------ // End For --cld_html debugging output //------------------------------------------------------------------------------ // Backscan to word boundary, returning how many bytes n to go back // so that src - n is non-space ans src - n - 1 is space. // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int BackscanToSpace(const char* src, int limit) { int n = 0; limit = minint(limit, kMaxSpaceScan); while (n < limit) { if (src[-n - 1] == ' ') {return n;} // We are at _X ++n; } n = 0; while (n < limit) { if ((src[-n] & 0xc0) != 0x80) {return n;} // We are at char begin ++n; } return 0; } // Forwardscan to word boundary, returning how many bytes n to go forward // so that src + n is non-space ans src + n - 1 is space. // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary int ForwardscanToSpace(const char* src, int limit) { int n = 0; limit = minint(limit, kMaxSpaceScan); while (n < limit) { if (src[n] == ' ') {return n + 1;} // We are at _X ++n; } n = 0; while (n < limit) { if ((src[n] & 0xc0) != 0x80) {return n;} // We are at char begin ++n; } return 0; } // This uses a cheap predictor to get a measure of compression, and // hence a measure of repetitiveness. It works on complete UTF-8 characters // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly // all the time when done with a byte-based count. Sigh. // // To allow running prediction across multiple chunks, caller passes in current // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. 
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character.
//
// isrc/src_len: text to scan.
// hash: in/out running 12-bit hash of the preceding characters.
// tbl:  in/out prediction table indexed by hash; each slot holds the packed
//       character last seen after that hash.
//
// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  int p_count = 0;
  // Unsigned byte view, so lead bytes >= 0x80 compare as positive values
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + src_len;
  int local_hash = *hash;

  while (src < srclimit) {
    int c = src[0];
    int incr = 1;

    // Pick up one char and length
    if (c < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((c & 0xe0) == 0xc0) {
      // Two-byte
      c = (c << 8) | src[1];
      incr = 2;
    } else if ((c & 0xf0) == 0xe0) {
      // Three-byte
      c = (c << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }
  *hash = local_hash;
  return p_count;
}



// Counts number of spaces; a little faster than one-at-a-time
// Doesn't count odd bytes at end
int CountSpaces4(const char* src, int src_len) {
  int s_count = 0;
  // Only whole groups of four bytes are examined
  for (int i = 0; i < (src_len & ~3); i += 4) {
    s_count += (src[i] == ' ');
    s_count += (src[i+1] == ' ');
    s_count += (src[i+2] == ' ');
    s_count += (src[i+3] == ' ');
  }
  return s_count;
}


// Remove words of text that have more than half their letters predicted
// correctly by our cheap predictor, moving the remaining words in-place
// to the front of the input buffer.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
// // Return the new, possibly-shorter length // // Result Buffer ALWAYS has leading space and trailing space space space NUL, // if input does // int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) { const uint8* src = reinterpret_cast(isrc); const uint8* srclimit = src + src_len; char* dst = isrc; int local_hash = *hash; char* word_dst = dst; // Start of next word int good_predict_bytes = 0; int word_length_bytes = 0; while (src < srclimit) { int c = src[0]; int incr = 1; *dst++ = c; if (c == ' ') { if ((good_predict_bytes * 2) > word_length_bytes) { // Word is well-predicted: backup to start of this word dst = word_dst; if (FLAGS_cld_showme) { // Mark the deletion point with period // Don't repeat multiple periods // Cannot mark with more bytes or may overwrite unseen input if ((isrc < (dst - 2)) && (dst[-2] != '.')) { *dst++ = '.'; *dst++ = ' '; } } } word_dst = dst; // Start of next word good_predict_bytes = 0; word_length_bytes = 0; } // Pick up one char and length if (c < 0xc0) { // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx // Do nothing more } else if ((c & 0xe0) == 0xc0) { // Two-byte *dst++ = src[1]; c = (c << 8) | src[1]; incr = 2; } else if ((c & 0xf0) == 0xe0) { // Three-byte *dst++ = src[1]; *dst++ = src[2]; c = (c << 16) | (src[1] << 8) | src[2]; incr = 3; } else { // Four-byte *dst++ = src[1]; *dst++ = src[2]; *dst++ = src[3]; c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; incr = 4; } src += incr; word_length_bytes += incr; int p = tbl[local_hash]; // Prediction tbl[local_hash] = c; // Update prediction if (c == p) { good_predict_bytes += incr; // Count good predictions } local_hash = ((local_hash << 4) ^ c) & 0xfff; } *hash = local_hash; if ((dst - isrc) < (src_len - 3)) { // Pad and make last char clean UTF-8 by putting following spaces dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; dst[3] = '\0'; } else if ((dst - isrc) < src_len) { // Make last char clean UTF-8 by putting following space off the end dst[0] 
= ' '; } return static_cast(dst - isrc); } // This alternate form overwrites redundant words, thus avoiding corrupting the // backmap for generating a vector of original-text ranges. int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) { const uint8* src = reinterpret_cast(isrc); const uint8* srclimit = src + src_len; char* dst = isrc; int local_hash = *hash; char* word_dst = dst; // Start of next word int good_predict_bytes = 0; int word_length_bytes = 0; while (src < srclimit) { int c = src[0]; int incr = 1; *dst++ = c; if (c == ' ') { if ((good_predict_bytes * 2) > word_length_bytes) { // Word [word_dst..dst-1) is well-predicted: overwrite for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';} } word_dst = dst; // Start of next word good_predict_bytes = 0; word_length_bytes = 0; } // Pick up one char and length if (c < 0xc0) { // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx // Do nothing more } else if ((c & 0xe0) == 0xc0) { // Two-byte *dst++ = src[1]; c = (c << 8) | src[1]; incr = 2; } else if ((c & 0xf0) == 0xe0) { // Three-byte *dst++ = src[1]; *dst++ = src[2]; c = (c << 16) | (src[1] << 8) | src[2]; incr = 3; } else { // Four-byte *dst++ = src[1]; *dst++ = src[2]; *dst++ = src[3]; c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; incr = 4; } src += incr; word_length_bytes += incr; int p = tbl[local_hash]; // Prediction tbl[local_hash] = c; // Update prediction if (c == p) { good_predict_bytes += incr; // Count good predictions } local_hash = ((local_hash << 4) ^ c) & 0xfff; } *hash = local_hash; if ((dst - isrc) < (src_len - 3)) { // Pad and make last char clean UTF-8 by putting following spaces dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; dst[3] = '\0'; } else if ((dst - isrc) < src_len) { // Make last char clean UTF-8 by putting following space off the end dst[0] = ' '; } return static_cast(dst - isrc); } // Remove portions of text that have a high density of spaces, or that are // overly repetitive, squeezing 
the remaining text in-place to the front of the // input buffer. // // Squeezing looks at density of space/prediced chars in fixed-size chunks, // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. // // Return the new, possibly-shorter length // // Result Buffer ALWAYS has leading space and trailing space space space NUL, // if input does // int CheapSqueezeInplace(char* isrc, int src_len, int ichunksize) { char* src = isrc; char* dst = src; char* srclimit = src + src_len; bool skipping = false; int hash = 0; // Allocate local prediction table. int* predict_tbl = new int[kPredictionTableSize]; memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); int chunksize = ichunksize; if (chunksize == 0) {chunksize = kChunksizeDefault;} int space_thresh = (chunksize * kSpacesThreshPercent) / 100; int predict_thresh = (chunksize * kPredictThreshPercent) / 100; while (src < srclimit) { int remaining_bytes = srclimit - src; int len = minint(chunksize, remaining_bytes); // Make len land us on a UTF-8 character boundary. // Ah. 
Also fixes mispredict because we could get out of phase // Loop always terminates at trailing space in buffer while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes int space_n = CountSpaces4(src, len); int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { // Skip the text if (!skipping) { // Keeping-to-skipping transition; do it at a space int n = BackscanToSpace(dst, static_cast(dst - isrc)); dst -= n; if (dst == isrc) { // Force a leading space if the first chunk is deleted *dst++ = ' '; } if (FLAGS_cld_showme) { // Mark the deletion point with black square U+25A0 *dst++ = static_cast(0xe2); *dst++ = static_cast(0x96); *dst++ = static_cast(0xa0); *dst++ = ' '; } skipping = true; } } else { // Keep the text if (skipping) { // Skipping-to-keeping transition; do it at a space int n = ForwardscanToSpace(src, len); src += n; remaining_bytes -= n; // Shrink remaining length len -= n; skipping = false; } // "len" can be negative in some cases if (len > 0) { memmove(dst, src, len); dst += len; } } src += len; } if ((dst - isrc) < (src_len - 3)) { // Pad and make last char clean UTF-8 by putting following spaces dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; dst[3] = '\0'; } else if ((dst - isrc) < src_len) { // Make last char clean UTF-8 by putting following space off the end dst[0] = ' '; } // Deallocate local prediction table delete[] predict_tbl; return static_cast(dst - isrc); } // This alternate form overwrites redundant words, thus avoiding corrupting the // backmap for generating a vector of original-text ranges. int CheapSqueezeInplaceOverwrite(char* isrc, int src_len, int ichunksize) { char* src = isrc; char* dst = src; char* srclimit = src + src_len; bool skipping = false; int hash = 0; // Allocate local prediction table. 
int* predict_tbl = new int[kPredictionTableSize]; memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); int chunksize = ichunksize; if (chunksize == 0) {chunksize = kChunksizeDefault;} int space_thresh = (chunksize * kSpacesThreshPercent) / 100; int predict_thresh = (chunksize * kPredictThreshPercent) / 100; // Always keep first byte (space) ++src; ++dst; while (src < srclimit) { int remaining_bytes = srclimit - src; int len = minint(chunksize, remaining_bytes); // Make len land us on a UTF-8 character boundary. // Ah. Also fixes mispredict because we could get out of phase // Loop always terminates at trailing space in buffer while ((src[len] & 0xc0) == 0x80) {++len;} // Move past continuation bytes int space_n = CountSpaces4(src, len); int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { // Overwrite the text [dst-n..dst) if (!skipping) { // Keeping-to-skipping transition; do it at a space int n = BackscanToSpace(dst, static_cast(dst - isrc)); // Text [word_dst..dst) is well-predicted: overwrite for (char* p = dst - n; p < dst; ++p) {*p = '.';} skipping = true; } // Overwrite the text [dst..dst+len) for (char* p = dst; p < dst + len; ++p) {*p = '.';} dst[len - 1] = ' '; // Space at end so we can see what is happening } else { // Keep the text if (skipping) { // Skipping-to-keeping transition; do it at a space int n = ForwardscanToSpace(src, len); // Text [dst..dst+n) is well-predicted: overwrite for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';} skipping = false; } } dst += len; src += len; } if ((dst - isrc) < (src_len - 3)) { // Pad and make last char clean UTF-8 by putting following spaces dst[0] = ' '; dst[1] = ' '; dst[2] = ' '; dst[3] = '\0'; } else if ((dst - isrc) < src_len) { // Make last char clean UTF-8 by putting following space off the end dst[0] = ' '; } // Deallocate local prediction table delete[] predict_tbl; return static_cast(dst - isrc); } // Timing 
2.8GHz P4 (dsites 2008.03.20) with 170KB input // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 // Just CountSpaces is about 340 MB/sec // Byte-only CountPredictedBytes is about 150 MB/sec // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c // Unjammed byte-only both = 170 MB/sec // Jammed byte-only both = 120 MB/sec // Back to original w/slight updates, 110 MB/sec // bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) { // Don't trigger at all on short text if (src_len < testsize) {return false;} int space_thresh = (testsize * kSpacesTriggerPercent) / 100; int predict_thresh = (testsize * kPredictTriggerPercent) / 100; int hash = 0; // Allocate local prediction table. int* predict_tbl = new int[kPredictionTableSize]; memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); bool retval = false; if ((CountSpaces4(src, testsize) >= space_thresh) || (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= predict_thresh)) { retval = true; } // Deallocate local prediction table delete[] predict_tbl; return retval; } // Delete any extended languages from doc_tote void RemoveExtendedLanguages(DocTote* doc_tote) { // Now a nop } static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this // For Tier3 languages, require a minimum number of bytes to be first-place lang static const int kGoodFirstT3MinBytes = 24; // no first // Move bytes for unreliable langs to another lang or UNKNOWN // doc_tote is sorted, so cannot Add // // If both CHINESE and CHINESET are present and unreliable, do not delete both; // merge both into CHINESE. 
// //dsites 2009.03.19 // we also want to remove Tier3 languages as the first lang if there is very // little text like ej1 ej2 ej3 ej4 // maybe fold this back in earlier // void RemoveUnreliableLanguages(DocTote* doc_tote, bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { // Prepass to merge some low-reliablility languages // TODO: this shouldn't really reach in to the internal structure of doc_tote int total_bytes = 0; for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { int plang = doc_tote->Key(sub); if (plang == DocTote::kUnusedKey) {continue;} // Empty slot Language lang = static_cast(plang); int bytes = doc_tote->Value(sub); int reli = doc_tote->Reliability(sub); if (bytes == 0) {continue;} // Zero bytes total_bytes += bytes; // Reliable percent = stored reliable score over stored bytecount int reliable_percent = reli / bytes; if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper // This language is too unreliable to keep, but we might merge it. Language altlang = UNKNOWN_LANGUAGE; if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];} if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative // Look for alternative in doc_tote int altsub = doc_tote->Find(altlang); if (altsub < 0) {continue;} // No alternative text int bytes2 = doc_tote->Value(altsub); int reli2 = doc_tote->Reliability(altsub); if (bytes2 == 0) {continue;} // Zero bytes // Reliable percent is stored reliable score over stored bytecount int reliable_percent2 = reli2 / bytes2; // Merge one language into the other. 
Break ties toward lower lang # int tosub = altsub; int fromsub = sub; bool into_lang = false; if ((reliable_percent2 < reliable_percent) || ((reliable_percent2 == reliable_percent) && (lang < altlang))) { tosub = sub; fromsub = altsub; into_lang = true; } // Make sure merged reliability doesn't drop and is enough to avoid delete int newpercent = maxint(reliable_percent, reliable_percent2); newpercent = maxint(newpercent, kMinReliableKeepPercent); int newbytes = bytes + bytes2; int newreli = newpercent * newbytes; doc_tote->SetKey(fromsub, DocTote::kUnusedKey); doc_tote->SetScore(fromsub, 0); doc_tote->SetReliability(fromsub, 0); doc_tote->SetScore(tosub, newbytes); doc_tote->SetReliability(tosub, newreli); // Show fate of unreliable languages if at least 10 bytes if (FLAGS_cld2_html && (newbytes >= 10) && !FLAGS_cld2_quiet) { if (into_lang) { fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", LanguageCode(altlang), reliable_percent2, bytes2, LanguageCode(lang)); } else { fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ", LanguageCode(lang), reliable_percent, bytes, LanguageCode(altlang)); } } } // Pass to delete any remaining unreliable languages for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { int plang = doc_tote->Key(sub); if (plang == DocTote::kUnusedKey) {continue;} // Empty slot Language lang = static_cast(plang); int bytes = doc_tote->Value(sub); int reli = doc_tote->Reliability(sub); if (bytes == 0) {continue;} // Zero bytes // Reliable percent is stored as reliable score over stored bytecount int reliable_percent = reli / bytes; if (reliable_percent >= kMinReliableKeepPercent) { // Keeper? 
continue; // yes } // Delete unreliable entry doc_tote->SetKey(sub, DocTote::kUnusedKey); doc_tote->SetScore(sub, 0); doc_tote->SetReliability(sub, 0); // Show fate of unreliable languages if at least 10 bytes if (FLAGS_cld2_html && (bytes >= 10) && !FLAGS_cld2_quiet) { fprintf(stderr, "{Unreli %s.%dR,%dB} ", LanguageCode(lang), reliable_percent, bytes); } } ////if (FLAGS_cld2_html) {fprintf(stderr, "
\n");} } // Move all the text bytes from lower byte-count to higher one void MoveLang1ToLang2(Language lang1, Language lang2, int lang1_sub, int lang2_sub, DocTote* doc_tote, ResultChunkVector* resultchunkvector) { // In doc_tote, move all the bytes lang1 => lang2 int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub); doc_tote->SetValue(lang2_sub, sum); sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub); doc_tote->SetScore(lang2_sub, sum); sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub); doc_tote->SetReliability(lang2_sub, sum); // Delete old entry doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey); doc_tote->SetScore(lang1_sub, 0); doc_tote->SetReliability(lang1_sub, 0); // In resultchunkvector, move all the bytes lang1 => lang2 if (resultchunkvector == NULL) {return;} int k = 0; uint16 prior_lang = UNKNOWN_LANGUAGE; for (int i = 0; i < static_cast(resultchunkvector->size()); ++i) { ResultChunk* rc = &(*resultchunkvector)[i]; if (rc->lang1 == lang1) { // Update entry[i] lang1 => lang2 rc->lang1 = lang2; } // One change may produce two merges -- entry before and entry after if ((rc->lang1 == prior_lang) && (k > 0)) { // Merge with previous, deleting entry[i] ResultChunk* prior_rc = &(*resultchunkvector)[k - 1]; prior_rc->bytes += rc->bytes; // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]
\n", i, k-1); } else { // Keep entry[i] (*resultchunkvector)[k] = (*resultchunkvector)[i]; // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]
\n", i, k); ++k; } prior_lang = rc->lang1; } resultchunkvector->resize(k); } // Move less likely byte count to more likely for close pairs of languages // If given, also update resultchunkvector void RefineScoredClosePairs(DocTote* doc_tote, ResultChunkVector* resultchunkvector, bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) { for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { int close_packedlang = doc_tote->Key(sub); int subscr = LanguageCloseSet(static_cast(close_packedlang)); if (subscr == 0) {continue;} // We have a close pair language -- if the other one is also scored and the // longword score differs enough, put all our eggs into one basket // Nonzero longword score: Go look for the other of this pair for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { if (LanguageCloseSet(static_cast(doc_tote->Key(sub2))) == subscr) { // We have a matching pair int close_packedlang2 = doc_tote->Key(sub2); // Move all the text bytes from lower byte-count to higher one int from_sub, to_sub; Language from_lang, to_lang; if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { from_sub = sub; to_sub = sub2; from_lang = static_cast(close_packedlang); to_lang = static_cast(close_packedlang2); } else { from_sub = sub2; to_sub = sub; from_lang = static_cast(close_packedlang2); to_lang = static_cast(close_packedlang); } if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { // Show fate of closepair language int val = doc_tote->Value(from_sub); // byte count int reli = doc_tote->Reliability(from_sub); int reliable_percent = reli / (val ? val : 1); // avoid zdiv fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}
\n", LanguageCode(from_lang), reliable_percent, doc_tote->Value(from_sub), LanguageCode(to_lang)); } MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub, doc_tote, resultchunkvector); break; // Exit inner for sub2 loop } } // End for sub2 } // End for sub } void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams, uint8* lang_hint_boost) { } void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { string temp(txt, len); fprintf(f, "%s", GetHtmlEscapedText(temp).c_str()); } void PrintLang(FILE* f, Tote* chunk_tote, Language cur_lang, bool cur_unreliable, Language prior_lang, bool prior_unreliable) { if (cur_lang == prior_lang) { fprintf(f, "[]"); } else { fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : ""); } } void PrintTopLang(Language top_lang) { if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { fprintf(stderr, "[] "); } else { fprintf(stderr, "[%s] ", LanguageName(top_lang)); prior_lang = top_lang; } } void PrintTopLangSpeculative(Language top_lang) { fprintf(stderr, "", 0xa0a0a0); if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { fprintf(stderr, "[] "); } else { fprintf(stderr, "[%s] ", LanguageName(top_lang)); prior_lang = top_lang; } fprintf(stderr, "\n"); } void PrintLangs(FILE* f, const Language* language3, const int* percent3, const int* text_bytes, const bool* is_reliable) { fprintf(f, "
  Initial_Languages "); if (language3[0] != UNKNOWN_LANGUAGE) { fprintf(f, "%s%s(%d%%) ", LanguageName(language3[0]), *is_reliable ? "" : "*", percent3[0]); } if (language3[1] != UNKNOWN_LANGUAGE) { fprintf(f, "%s(%d%%) ", LanguageName(language3[1]), percent3[1]); } if (language3[2] != UNKNOWN_LANGUAGE) { fprintf(f, "%s(%d%%) ", LanguageName(language3[2]), percent3[2]); } fprintf(f, "%d bytes \n", *text_bytes); fprintf(f, "
\n"); } // Return internal probability score (sum) per 1024 bytes double GetNormalizedScore(Language lang, ULScript ulscript, int bytecount, int score) { if (bytecount <= 0) {return 0.0;} return (score << 10) / bytecount; } // Extract return values before fixups void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes, int* reliable_percent3, Language* language3, int* percent3, double* normalized_score3, int* text_bytes, bool* is_reliable) { reliable_percent3[0] = 0; reliable_percent3[1] = 0; reliable_percent3[2] = 0; language3[0] = UNKNOWN_LANGUAGE; language3[1] = UNKNOWN_LANGUAGE; language3[2] = UNKNOWN_LANGUAGE; percent3[0] = 0; percent3[1] = 0; percent3[2] = 0; normalized_score3[0] = 0.0; normalized_score3[1] = 0.0; normalized_score3[2] = 0.0; *text_bytes = total_text_bytes; *is_reliable = false; int bytecount1 = 0; int bytecount2 = 0; int bytecount3 = 0; int lang1 = doc_tote->Key(0); if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { // We have a top language language3[0] = static_cast(lang1); bytecount1 = doc_tote->Value(0); int reli1 = doc_tote->Reliability(0); reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv normalized_score3[0] = GetNormalizedScore(language3[0], ULScript_Common, bytecount1, doc_tote->Score(0)); } int lang2 = doc_tote->Key(1); if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) { language3[1] = static_cast(lang2); bytecount2 = doc_tote->Value(1); int reli2 = doc_tote->Reliability(1); reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv normalized_score3[1] = GetNormalizedScore(language3[1], ULScript_Common, bytecount2, doc_tote->Score(1)); } int lang3 = doc_tote->Key(2); if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) { language3[2] = static_cast(lang3); bytecount3 = doc_tote->Value(2); int reli3 = doc_tote->Reliability(2); reliable_percent3[2] = reli3 / (bytecount3 ? 
bytecount3 : 1); // avoid zdiv normalized_score3[2] = GetNormalizedScore(language3[2], ULScript_Common, bytecount3, doc_tote->Score(2)); } // Increase total bytes to sum (top 3) if low for some reason int total_bytecount12 = bytecount1 + bytecount2; int total_bytecount123 = total_bytecount12 + bytecount3; if (total_text_bytes < total_bytecount123) { total_text_bytes = total_bytecount123; *text_bytes = total_text_bytes; } // Sum minus previous % gives better roundoff behavior than bytecount/total int total_text_bytes_div = maxint(1, total_text_bytes); // Avoid zdiv percent3[0] = (bytecount1 * 100) / total_text_bytes_div; percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; percent3[2] -= percent3[1]; percent3[1] -= percent3[0]; // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% // Fix this explicitly if (percent3[1] < percent3[2]) { ++percent3[1]; --percent3[2]; } if (percent3[0] < percent3[1]) { ++percent3[0]; --percent3[1]; } *text_bytes = total_text_bytes; if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) { // We have a top language // Its reliability is overall result reliability int bytecount = doc_tote->Value(0); int reli = doc_tote->Reliability(0); int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv *is_reliable = (reliable_percent >= kMinReliableKeepPercent); } else { // No top language at all. This can happen with zero text or 100% Klingon // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable. *is_reliable = false; } // If ignore percent is too large, set unreliable. 
int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); if ((ignore_percent > kIgnoreMaxPercent)) { *is_reliable = false; } } bool IsFIGS(Language lang) { if (lang == FRENCH) {return true;} if (lang == ITALIAN) {return true;} if (lang == GERMAN) {return true;} if (lang == SPANISH) {return true;} return false; } bool IsEFIGS(Language lang) { if (lang == ENGLISH) {return true;} if (lang == FRENCH) {return true;} if (lang == ITALIAN) {return true;} if (lang == GERMAN) {return true;} if (lang == SPANISH) {return true;} return false; } // For Tier3 languages, require more bytes of text to override // the first-place language static const int kGoodSecondT1T2MinBytes = 15; // no second static const int kGoodSecondT3MinBytes = 128; // no second // Calculate a single summary language for the document, and its reliability. // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE // This is the heart of matching human-rater perception. // reliable_percent3[] is currently unused // // Do not return Tier3 second language unless there are at least 128 bytes void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes, const int* reliable_percent3, const Language* language3, const int* percent3, Language* summary_lang, bool* is_reliable, bool FLAGS_cld2_html, bool FLAGS_cld2_quiet, int flags) { // Vector of active languages; changes if we delete some int slot_count = 3; int active_slot[3] = {0, 1, 2}; int ignore_percent = 0; int return_percent = percent3[0]; // Default to top lang *summary_lang = language3[0]; *is_reliable = true; if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} // If any of top 3 is IGNORE, remove it and increment ignore_percent for (int i = 0; i < 3; ++i) { if (language3[i] == TG_UNKNOWN_LANGUAGE) { ignore_percent += percent3[i]; // Move the rest up, leaving input vectors unchanged for (int j=i+1; j < 3; ++j) { active_slot[j - 1] = active_slot[j]; } -- slot_count; // Logically remove Ignore from percentage-text calculation 
// (extra 1 in 101 avoids zdiv, biases slightly small) return_percent = (percent3[0] * 100) / (101 - ignore_percent); *summary_lang = language3[active_slot[0]]; if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} } } // If English and X, where X (not UNK) is big enough, // assume the English is boilerplate and return X. // Logically remove English from percentage-text calculation int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; // Require more bytes of text for Tier3 languages int minbytesneeded = kGoodSecondT1T2MinBytes; int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]); if ((language3[active_slot[0]] == ENGLISH) && (language3[active_slot[1]] != ENGLISH) && (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && (second_bytes >= minbytesneeded)) { ignore_percent += percent3[active_slot[0]]; return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); *summary_lang = language3[active_slot[1]]; if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, // assume the FIGS is boilerplate and return X. 
// Logically remove FIGS from percentage-text calculation } else if (IsFIGS(language3[active_slot[0]]) && !IsEFIGS(language3[active_slot[1]]) && (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && (second_bytes >= minbytesneeded)) { ignore_percent += percent3[active_slot[0]]; return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); *summary_lang = language3[active_slot[1]]; if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} // Else we are returning the first language, but want to improve its // return_percent if the second language should be ignored } else if ((language3[active_slot[1]] == ENGLISH) && (language3[active_slot[0]] != ENGLISH)) { ignore_percent += percent3[active_slot[1]]; return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); } else if (IsFIGS(language3[active_slot[1]]) && !IsEFIGS(language3[active_slot[0]])) { ignore_percent += percent3[active_slot[1]]; return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); } // If return percent is too small (too many languages), return UNKNOWN if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) { if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { fprintf(stderr, "{Unreli %s %d%% percent too small} ", LanguageCode(*summary_lang), return_percent); } *summary_lang = UNKNOWN_LANGUAGE; *is_reliable = false; } // If return percent is small, return language but set unreliable. if ((return_percent < kGoodFirstReliableMinPercent)) { *is_reliable = false; } // If ignore percent is too large, set unreliable. 
ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]); if ((ignore_percent > kIgnoreMaxPercent)) { *is_reliable = false; } // If we removed all the active languages, return UNKNOWN if (slot_count == 0) { if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { fprintf(stderr, "{Unreli %s no languages left} ", LanguageCode(*summary_lang)); } *summary_lang = UNKNOWN_LANGUAGE; *is_reliable = false; } } void AddLangPriorBoost(Language lang, uint32 langprob, ScoringContext* scoringcontext) { // This is called 0..n times with language hints // but we don't know the script -- so boost either or both Latn, Othr. if (IsLatnLanguage(lang)) { LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn; int n = langprior_boost->n; langprior_boost->langprob[n] = langprob; langprior_boost->n = langprior_boost->wrap(n + 1); } if (IsOthrLanguage(lang)) { LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr; int n = langprior_boost->n; langprior_boost->langprob[n] = langprob; langprior_boost->n = langprior_boost->wrap(n + 1); } } void AddOneWhack(Language whacker_lang, Language whackee_lang, ScoringContext* scoringcontext) { uint32 langprob = MakeLangProb(whackee_lang, 1); // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) { LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn; int n = langprior_whack->n; langprior_whack->langprob[n] = langprob; langprior_whack->n = langprior_whack->wrap(n + 1); } if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) { LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr; int n = langprior_whack->n; langprior_whack->langprob[n] = langprob; langprior_whack->n = langprior_whack->wrap(n + 1); } } void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) { // We do not in general want zh-Hans and zh-Hant to be close pairs, // but we do here. 
if (lang == CLD2::CHINESE) { AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext); return; } if (lang == CLD2::CHINESE_T) { AddOneWhack(lang, CLD2::CHINESE, scoringcontext); return; } int base_lang_set = LanguageCloseSet(lang); if (base_lang_set == 0) {return;} // TODO: add an explicit list of each set to avoid this 512-times loop for (int i = 0; i < kLanguageToPLangSize; ++i) { Language lang2 = static_cast(i); if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) { AddOneWhack(lang, lang2, scoringcontext); } } } void ApplyHints(const char* buffer, int buffer_length, bool is_plain_text, const CLDHints* cld_hints, ScoringContext* scoringcontext) { CLDLangPriors lang_priors; InitCLDLangPriors(&lang_priors); // We now use lang= tags. // Last look, circa 2008 found only 15% of web pages with lang= tags and // many of those were wrong. Now (July 2011), we find 44% of web pages have // lang= tags, and most of them are correct. So we now give them substantial // weight in each chunk scored. if (!is_plain_text) { // Get any contained language tags in first n KB int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10; string lang_tags = GetLangTagsFromHtml(buffer, buffer_length, max_scan_bytes); SetCLDLangTagsHint(lang_tags, &lang_priors); if (scoringcontext->flags_cld2_html) { if (!lang_tags.empty()) { fprintf(scoringcontext->debug_file, "
lang_tags '%s'
\n", lang_tags.c_str()); } } } if (cld_hints != NULL) { if ((cld_hints->content_language_hint != NULL) && (cld_hints->content_language_hint[0] != '\0')) { SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors); } // Input is from GetTLD(), already lowercased if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) { SetCLDTLDHint(cld_hints->tld_hint, &lang_priors); } if (cld_hints->encoding_hint != UNKNOWN_ENCODING) { Encoding enc = static_cast(cld_hints->encoding_hint); SetCLDEncodingHint(enc, &lang_priors); } if (cld_hints->language_hint != UNKNOWN_LANGUAGE) { SetCLDLanguageHint(cld_hints->language_hint, &lang_priors); } } // Keep no more than four different languages with hints TrimCLDLangPriors(4, &lang_priors); if (scoringcontext->flags_cld2_html) { string print_temp = DumpCLDLangPriors(&lang_priors); if (!print_temp.empty()) { fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s
\n", print_temp.c_str()); } } // Put boosts into ScoringContext for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { Language lang = GetCLDPriorLang(lang_priors.prior[i]); int qprob = GetCLDPriorWeight(lang_priors.prior[i]); if (qprob > 0) { uint32 langprob = MakeLangProb(lang, qprob); AddLangPriorBoost(lang, langprob, scoringcontext); } } // Put whacks into scoring context // We do not in general want zh-Hans and zh-Hant to be close pairs, // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant std::vector close_set_count(kCloseSetSize + 1, 0); for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { Language lang = GetCLDPriorLang(lang_priors.prior[i]); ++close_set_count[LanguageCloseSet(lang)]; if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];} if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];} } // If a boost language is in a close set, force suppressing the others in // that set, if exactly one of the set is present for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) { Language lang = GetCLDPriorLang(lang_priors.prior[i]); int qprob = GetCLDPriorWeight(lang_priors.prior[i]); if (qprob > 0) { int close_set = LanguageCloseSet(lang); if ((close_set > 0) && (close_set_count[close_set] == 1)) { AddCloseLangWhack(lang, scoringcontext); } if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) && (close_set_count[kCloseSetSize] == 1)) { AddCloseLangWhack(lang, scoringcontext); } } } } // Extend results to fully cover the [lo..hi) range void FinishResultVector(int lo, int hi, ResultChunkVector* vec) { if (vec == NULL) {return;} if (vec->size() == 0) {return;} ResultChunk* rc = &(*vec)[0]; if (rc->offset > lo) { int diff = rc->offset - lo; rc->offset -= diff; rc->bytes += diff; } ResultChunk* rc2 = &(*vec)[vec->size() - 1]; int rc2hi = rc2->offset + rc2->bytes; if (rc2hi < hi) { int diff = hi - rc2hi; rc2->bytes += diff; } } // Results language3/percent3/text_bytes must be exactly three items 
Language DetectLanguageSummaryV2( const char* buffer, int buffer_length, bool is_plain_text, const CLDHints* cld_hints, bool allow_extended_lang, int flags, Language plus_one, Language* language3, int* percent3, double* normalized_score3, ResultChunkVector* resultchunkvector, int* text_bytes, bool* is_reliable) { language3[0] = UNKNOWN_LANGUAGE; language3[1] = UNKNOWN_LANGUAGE; language3[2] = UNKNOWN_LANGUAGE; percent3[0] = 0; percent3[1] = 0; percent3[2] = 0; normalized_score3[0] = 0.0; normalized_score3[1] = 0.0; normalized_score3[2] = 0.0; if (resultchunkvector != NULL) { resultchunkvector->clear(); } *text_bytes = 0; *is_reliable = false; if ((flags & kCLDFlagEcho) != 0) { string temp(buffer, buffer_length); if ((flags & kCLDFlagHtml) != 0) { fprintf(stderr, "CLD2[%d] '%s'
\n", buffer_length, GetHtmlEscapedText(temp).c_str()); } else { fprintf(stderr, "CLD2[%d] '%s'\n", buffer_length, GetPlainEscapedText(temp).c_str()); } } #ifdef CLD2_DYNAMIC_MODE // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file // hasn't been loaded yet. This is the only sane thing we can do, as there // are no scoring tables to consult. bool dataLoaded = isDataLoaded(); if ((flags & kCLDFlagVerbose) != 0) { fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false")); } if (!dataLoaded) { return UNKNOWN_LANGUAGE; } #endif // Exit now if no text if (buffer_length == 0) {return UNKNOWN_LANGUAGE;} if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;} // Document totals DocTote doc_tote; // Reliability = 0..100 // ScoringContext carries state across scriptspans ScoringContext scoringcontext; scoringcontext.debug_file = stderr; scoringcontext.flags_cld2_score_as_quads = ((flags & kCLDFlagScoreAsQuads) != 0); scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0); scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0); scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0); scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE; scoringcontext.ulscript = ULScript_Common; scoringcontext.scoringtables = &kScoringtables; scoringcontext.scanner = NULL; scoringcontext.init(); // Clear the internal memory arrays // Now thread safe. 
bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0); bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0); ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext); // Four individual script totals, Latin, Han, other2, other3 int next_other_tote = 2; int tote_num = 0; // Four totes for up to four different scripts pending at once Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other bool tote_seen[4] = {false, false, false, false}; int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk ULScript tote_script[4] = {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common}; // Loop through text spans in a single script ScriptScanner ss(buffer, buffer_length, is_plain_text); LangSpan scriptspan; scoringcontext.scanner = &ss; scriptspan.text = NULL; scriptspan.text_bytes = 0; scriptspan.offset = 0; scriptspan.ulscript = ULScript_Common; scriptspan.lang = UNKNOWN_LANGUAGE; int total_text_bytes = 0; int textlimit = FLAGS_cld_textlimit << 10; // in KB if (textlimit == 0) {textlimit = 0x7fffffff;} int advance_by = 2; // Advance 2 bytes int advance_limit = textlimit >> 3; // For first 1/8 of max document int initial_word_span = kDefaultWordSpan; if (FLAGS_cld_forcewords) { initial_word_span = kReallyBigWordSpan; } // Pick up chunk sizes // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each // Sanity check -- force into a reasonable range int chunksizequads = FLAGS_cld_smoothwidth; chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads), kMaxChunkSizeQuads); int chunksizeunis = (chunksizequads * 5) >> 1; // Varying short-span limit doesn't work well -- skips too much beyond 20KB // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; int spantooshortlimit = kShortSpanThresh; // For debugging only. 
Not thread-safe prior_lang = UNKNOWN_LANGUAGE; prior_unreliable = false; // Allocate full-document prediction table for finding repeating words int hash = 0; int* predict_tbl = new int[kPredictionTableSize]; if (FlagRepeats(flags)) { memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); } // Loop through scriptspans accumulating number of text bytes in each language while (ss.GetOneScriptSpanLower(&scriptspan)) { ULScript ulscript = scriptspan.ulscript; // Squeeze out big chunks of text span if asked to if (FlagSqueeze(flags)) { // Remove repetitive or mostly-spaces chunks int newlen; int chunksize = 0; // Use the default if (resultchunkvector != NULL) { newlen = CheapSqueezeInplaceOverwrite(scriptspan.text, scriptspan.text_bytes, chunksize); } else { newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, chunksize); } scriptspan.text_bytes = newlen; } else { // Check now and then to see if we should be squeezing if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) && !FlagFinish(flags)) { // fprintf(stderr, "CheapSqueezeTriggerTest, " // "first %d bytes of %d (>%d/2)
\n", // kCheapSqueezeTestLen, // scriptspan.text_bytes, // kCheapSqueezeTestThresh); if (CheapSqueezeTriggerTest(scriptspan.text, scriptspan.text_bytes, kCheapSqueezeTestLen)) { // Recursive call with big-chunk squeezing set if (FLAGS_cld2_html || FLAGS_dbgscore) { fprintf(stderr, "
---text_bytes[%d] Recursive(Squeeze)---

\n", total_text_bytes); } // Deallocate full-document prediction table delete[] predict_tbl; return DetectLanguageSummaryV2( buffer, buffer_length, is_plain_text, cld_hints, allow_extended_lang, flags | kCLDFlagSqueeze, plus_one, language3, percent3, normalized_score3, resultchunkvector, text_bytes, is_reliable); } } } // Remove repetitive words if asked to if (FlagRepeats(flags)) { // Remove repetitive words int newlen; if (resultchunkvector != NULL) { newlen = CheapRepWordsInplaceOverwrite(scriptspan.text, scriptspan.text_bytes, &hash, predict_tbl); } else { newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, &hash, predict_tbl); } scriptspan.text_bytes = newlen; } // Scoring depends on scriptspan buffer ALWAYS having // leading space and off-the-end space space space NUL, // DCHECK(scriptspan.text[0] == ' '); // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' '); // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' '); // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' '); // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0'); // The real scoring // Accumulate directly into the document total, or accmulate in one of four // chunk totals. The purpose of the multiple chunk totals is to piece // together short choppy pieces of text in alternating scripts. One total is // dedicated to Latin text, one to Han text, and the other two are dynamicly // assigned. scoringcontext.ulscript = scriptspan.ulscript; // FLAGS_cld2_html = scoringcontext.flags_cld2_html; ScoreOneScriptSpan(scriptspan, &scoringcontext, &doc_tote, resultchunkvector); total_text_bytes += scriptspan.text_bytes; } // End while (ss.GetOneScriptSpanLower()) // Deallocate full-document prediction table delete[] predict_tbl; if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { // If no forced , put one in front of dump if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "
\n");} doc_tote.Dump(stderr); } // If extended langauges are disallowed, remove them here if (!allow_extended_lang) { RemoveExtendedLanguages(&doc_tote); } // Force close pairs to one or the other // If given, also update resultchunkvector RefineScoredClosePairs(&doc_tote, resultchunkvector, FLAGS_cld2_html, FLAGS_cld2_quiet); // Calculate return results // Find top three byte counts in tote heap int reliable_percent3[3]; // Cannot use Add, etc. after sorting doc_tote.Sort(3); ExtractLangEtc(&doc_tote, total_text_bytes, reliable_percent3, language3, percent3, normalized_score3, text_bytes, is_reliable); bool have_good_answer = false; if (FlagFinish(flags)) { // Force a result have_good_answer = true; } else if (total_text_bytes <= kShortTextThresh) { // Don't recurse on short text -- we already did word scores have_good_answer = true; } else if (*is_reliable && (percent3[0] >= kGoodLang1Percent)) { have_good_answer = true; } else if (*is_reliable && ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { have_good_answer = true; } if (have_good_answer) { // This is the real, non-recursive return // Move bytes for unreliable langs to another lang or UNKNOWN if (!FlagBestEffort(flags)) { RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet); } // Redo the result extraction after the removal above doc_tote.Sort(3); ExtractLangEtc(&doc_tote, total_text_bytes, reliable_percent3, language3, percent3, normalized_score3, text_bytes, is_reliable); Language summary_lang; CalcSummaryLang(&doc_tote, total_text_bytes, reliable_percent3, language3, percent3, &summary_lang, is_reliable, FLAGS_cld2_html, FLAGS_cld2_quiet, flags); if (FLAGS_cld2_html && !FLAGS_cld2_quiet) { for (int i = 0; i < 3; ++i) { if (language3[i] != UNKNOWN_LANGUAGE) { fprintf(stderr, "%s.%dR(%d%%) ", LanguageCode(language3[i]), reliable_percent3[i], percent3[i]); } } fprintf(stderr, "%d bytes ", total_text_bytes); fprintf(stderr, "= %s%c ", LanguageName(summary_lang), *is_reliable ? 
' ' : '*'); fprintf(stderr, "

\n"); } // Slightly condensed if quiet if (FLAGS_cld2_html && FLAGS_cld2_quiet) { fprintf(stderr, "       "); for (int i = 0; i < 3; ++i) { if (language3[i] != UNKNOWN_LANGUAGE) { fprintf(stderr, "  %s %d%% ", LanguageCode(language3[i]), percent3[i]); } } fprintf(stderr, "= %s%c ", LanguageName(summary_lang), *is_reliable ? ' ' : '*'); fprintf(stderr, "
\n"); } // Extend results to fully cover the input buffer FinishResultVector(0, buffer_length, resultchunkvector); return summary_lang; } // Not a good answer -- do recursive call to refine if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) { // This is what we hope to improve on in the recursive call, if any PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); } // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 // For this purpose, we treate "Ignore" as top40 Language new_plus_one = UNKNOWN_LANGUAGE; if (total_text_bytes < kShortTextThresh) { // Short text: Recursive call with top40 and short set if (FLAGS_cld2_html || FLAGS_dbgscore) { fprintf(stderr, "  ---text_bytes[%d] " "Recursive(Top40/Rep/Short/Words)---

\n", total_text_bytes); } return DetectLanguageSummaryV2( buffer, buffer_length, is_plain_text, cld_hints, allow_extended_lang, flags | kCLDFlagTop40 | kCLDFlagRepeats | kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, new_plus_one, language3, percent3, normalized_score3, resultchunkvector, text_bytes, is_reliable); } // Longer text: Recursive call with top40 set if (FLAGS_cld2_html || FLAGS_dbgscore) { fprintf(stderr, "  ---text_bytes[%d] Recursive(Top40/Rep)---

\n", total_text_bytes); } return DetectLanguageSummaryV2( buffer, buffer_length, is_plain_text, cld_hints, allow_extended_lang, flags | kCLDFlagTop40 | kCLDFlagRepeats | kCLDFlagFinish, new_plus_one, language3, percent3, normalized_score3, resultchunkvector, text_bytes, is_reliable); } // For debugging and wrappers. Not thread safe. static char temp_detectlanguageversion[32]; // Return version text string // String is "code_version - data_build_date" const char* DetectLanguageVersion() { if (kScoringtables.quadgram_obj == NULL) {return "";} sprintf(temp_detectlanguageversion, "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate); return temp_detectlanguageversion; } } // End namespace CLD2