"); AppendString(title()); AppendString( "

\n"; std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { para_is_ltr = true; // reset to default direction hocr_str << "

IsAtBeginningOf(RIL_PARA)) { hocr_str << "\n

ParagraphIsLtr(); if (!para_is_ltr) { hocr_str << " dir='rtl'"; } hocr_str << " id='" << "par_" << page_id << "_" << pcnt << "'"; paragraph_lang = res_it->WordRecognitionLanguage(); if (paragraph_lang) { hocr_str << " lang='" << paragraph_lang << "'"; } AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { hocr_str << "\n >>* choiceMap = nullptr; if (tesseract_->lstm_choice_mode) { choiceMap = res_it->GetBestLSTMSymbolChoices(); } hocr_str << "\n BoundingBox(RIL_WORD, &left, &top, &right, &bottom); font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom << "; x_wconf " << static_cast(res_it->Confidence(RIL_WORD)); if (font_info) { if (font_name) { hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); } hocr_str << "; x_fsize " << pointsize; } hocr_str << "'"; const char* lang = res_it->WordRecognitionLanguage(); if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { hocr_str << " lang='" << lang << "'"; } switch (res_it->WordDirection()) { // Only emit direction if different from current paragraph direction case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str << " dir='ltr'"; break; case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str << " dir='rtl'"; break; case DIR_MIX: case DIR_NEUTRAL: default: // Do nothing. break; } hocr_str << ">"; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); if (bold) hocr_str << ""; if (italic) hocr_str << ""; do { const std::unique_ptr grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { if (hocr_boxes) { res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); hocr_str << "\n "; } hocr_str << HOcrEscape(grapheme.get()).c_str(); if (hocr_boxes) { hocr_str << ""; } } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (italic) hocr_str << ""; if (bold) hocr_str << ""; // If the lstm choice mode is required it is added here if (tesseract_->lstm_choice_mode == 1 && choiceMap != nullptr) { for (auto timestep : *choiceMap) { hocr_str << "\n "; for (std::pair conf : timestep) { hocr_str << "" << conf.first << ""; gcnt++; } hocr_str << ""; tcnt++; } } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) { for (auto timestep : *choiceMap) { if (timestep.size() > 0) { hocr_str << "\n "; for (auto & j : timestep) { hocr_str << "" << j.first << ""; gcnt++; } hocr_str << ""; tcnt++; } } } // Close ocrx_word. if (hocr_boxes || tesseract_->lstm_choice_mode > 0) { hocr_str << "\n "; } hocr_str << ""; tcnt = 1; gcnt = 1; wcnt++; // Close any ending block/paragraph/textline. if (last_word_in_line) { hocr_str << "\n "; lcnt++; } if (last_word_in_para) { hocr_str << "\n

\n"; pcnt++; para_is_ltr = true; // back to default direction } if (last_word_in_block) { hocr_str << "

\n"; bcnt++; } } hocr_str << "