/** * DOCX Citations importer * * Extracts bibliographic citations from the XML of a DOCX file's * word/document.xml (or any XML fragment containing the same markup). * * Supported citation manager formats and how each is handled: * * - Word native `CITATION key \l locale` inline field + * `customXml/item1.xml` sources (passed as * `sourcesXml` option). Delegated to * DocxNativeParser in docx-native.ts. * * - Zotero `ADDIN ZOTERO_ITEM CSL_CITATION {json}` inline * field. The embedded CSL-JSON `citationItems` * array is reshaped into a Record * and fed to CSLParser. * * - Mendeley Cite v3 Current add-in: `` with the citation JSON * (current) base64-encoded in `w:tag w:val= * "MENDELEY_CITATION_v3_{base64}"`. Decoded payload * is CSL-JSON; handled identically to Zotero. * * - Mendeley Desktop Legacy add-in: `ADDIN CSL_CITATION {json}` inline * (legacy) field. Same CSL-JSON shape; handled identically * to Zotero. * * - EndNote `ADDIN EN.CITE …` — XML is * either entity-escaped inline or base64-encoded in * ``. The `` subtree is * converted to an EndNoteRecord object and passed to * EndNoteParser. * * - Citavi `` wrapping `ADDIN CitaviPlaceholder * {base64}`. The base64-decoded JSON contains a * WordPlaceholder with `Entries[].ReferenceId` * UUIDs. Two sub-cases: * * A. If the payload already embeds `Reference` * objects (some Citavi export modes), they are * passed directly to CitaviParser. * * B. In older or incomplete formats, only UUIDs may * be present without embedded references. Such * citations cannot be fully resolved and will * generate warnings. * * Usage: * const parser = new DocxCitationsParser(documentXml, { * sourcesXml, // contents of customXml/item1.xml (Word-native) * }) * const result = parser.parse() * // result.entries → BibDB (Record) * // result.errors → ErrorObject[] * // result.warnings → ErrorObject[] * * The `sourcesXml` option must be the contents of `customXml/item1.xml` from * the DOCX ZIP when Word-native citations are present. * * Citavi citations embed complete bibliographic data directly in each citation * field, so no external Citavi project file is required. */ import type { EntryObject } from "../const" import { type CitaviInput, CitaviParser } from "./citavi" import { type CSLEntry, CSLParser } from "./csl" import { DocxNativeParser } from "./docx-native" import { EndNoteParser, type EndNoteRecord } from "./endnote" import { extractJsonObject } from "./tools" // --------------------------------------------------------------------------- // Public result types // --------------------------------------------------------------------------- export interface DocxCitationsParseResult { entries: Record errors: ErrorObject[] warnings: ErrorObject[] } // --------------------------------------------------------------------------- // Citation accumulator — shared mutable state for multi-element processing // --------------------------------------------------------------------------- /** * Mutable accumulator passed to static extraction methods when processing * multiple document elements in a single pass. All four fields are mutated * in place as entries are discovered and keys are deduplicated. */ export interface CitationAccumulator { entries: EntryObject[] errors: ErrorObject[] warnings: ErrorObject[] seenKeys: Set /** * Persistent map from raw CSL `id` strings to the normalised `entry_key` * values assigned by `CSLParser`. Accumulated across all citation elements * processed with the same accumulator so that duplicate items (already in * `seenKeys`) can still have their metadata resolved to the correct key. */ cslRawIdToEntryKey: Map } interface ErrorObject { type: string field?: string value?: unknown entry?: string } // --------------------------------------------------------------------------- // Citation item metadata // --------------------------------------------------------------------------- /** * Per-entry citation metadata, keyed by `entry_key`. * * This captures the cite-specific decorations that surround a bibliographic * reference inside a single citation: page locators, textual prefixes / * suffixes, and author-rendering flags. It is returned alongside the * `entries` BibDB when `retrieveMetadata` is `true` on a static method call. * * Field availability by format: * * | Field | Zotero | Mendeley | EndNote | Citavi | * |------------------|--------|----------|----------------------|-------------------------------------| * | locator | ✅ | ✅ | ✅ (Pages) | ✅ (PageRange.OriginalString) | * | label | ✅ | ✅ | – | ✅ (derived from PageRange.NumberingType; mapping inferred from manual) | * | prefix | ✅ | ✅ | ✅ | ✅ | * | suffix | ✅ | ✅ | ✅ | ✅ (confirmed by manual; not yet seen in real files) | * | suppressAuthor | ✅ | ✅ | – | – | * | authorOnly | ✅ | ✅ | – | – | * | authorYear | – | – | ✅ (AuthorYear attr) | – | * | bibliographyEntry| – | – | – | ✅ (confirmed by manual; not yet seen in real files) | * | ruleSet | – | – | – | ✅ (confirmed by manual; serialised form unknown) | * | formatOption | – | – | – | ✅ (confirmed by manual; serialised form unknown) | * | insertAs | – | – | – | ✅ (confirmed by manual; serialised form unknown) | */ export interface CitationItemMetadata { /** The `entry_key` of the corresponding entry in the returned `entries` BibDB. */ entry_key: string /** * Pinpoint location within the cited work (page number, chapter, etc.). * For CSL formats this is the raw `locator` string; for EndNote it is the * `` element text; for Citavi it is `PageRange.OriginalString`. */ locator?: string /** * CSL locator type label (e.g. `"page"`, `"chapter"`, `"section"`). * For CSL-based formats (Zotero, Mendeley) this is the raw `label` string * from the citation item. For Citavi it is derived from `PageRange.NumberingType`: * `0` (Pages) → `"page"`, `1` (Columns) → `"column"`, * `2` (Section numbers) → `"section"`, `3` (Margin numbers) → `"note"`, * `4` (Other / free-form) → `"custom"`. * The integer-to-label mapping for Citavi is inferred from the Citavi manual * and has not been confirmed against observed data beyond value `0`. */ label?: string /** Text to prepend to the formatted citation (e.g. `"see "`, `"cf. "`). */ prefix?: string /** Text to append to the formatted citation (e.g. `", etc."`). */ suffix?: string /** * When `true`, author names are suppressed in the formatted output, * leaving only the year (and locator) in parentheses: `(2020, p. 45)`. * Only populated for CSL-based formats (Zotero, Mendeley). */ suppressAuthor?: boolean /** * When `true`, only the author name is rendered with nothing else: * `William T. Williams`. * Only populated for CSL-based formats (Zotero, Mendeley). */ authorOnly?: boolean /** * When `true`, the author name is rendered outside the parentheses while * the year (and locator) remain inside: `William T. Williams (2020, p. 45)`. * This reflects the `AuthorYear="1"` attribute on EndNote's `` element. * Only populated for EndNote citations. */ authorYear?: boolean /** * Controls whether and where this reference appears in the bibliography. * Only populated for Citavi citations (from `Entries[].BibliographyEntry`). * * Known values: * `"/bibonly"` – reference appears only in the bibliography, not in-text * `"/nobib"` – reference appears only in-text, not in the bibliography * * When absent the reference appears in both (default behaviour). * Confirmed by the Citavi manual; not yet observed in real files. */ bibliographyEntry?: string /** * Overrides which citation-style rule set (formatting variant) is used for * this entry. Only populated for Citavi citations (from `Entries[].RuleSet`). * Serialised form not yet observed in real files. */ ruleSet?: unknown /** * Selects among the citation style's optional formatting variants for this * entry (values 1, 2, or 3). Only populated for Citavi citations (from * `Entries[].FormatOption`). Serialised form not yet observed in real files. */ formatOption?: unknown /** * Overrides where the citation is physically inserted (in-text vs. footnote). * Only populated for Citavi citations (from `Entries[].InsertAs`). * Serialised form not yet observed in real files. */ insertAs?: unknown } // --------------------------------------------------------------------------- // Static utility result types // --------------------------------------------------------------------------- export interface CitationResult { isCitation: boolean format?: string // e.g., "zotero", "mendeley_v3", "endnote", "citavi", "word_native" entries?: Record errors?: ErrorObject[] warnings?: ErrorObject[] /** * Per-entry citation metadata (locators, prefixes, suffixes, flags). * Only populated when `retrieveMetadata` is `true` on the static method call. */ metadata?: CitationItemMetadata[] } export interface BibliographyResult { isBibliography: boolean format?: string } // --------------------------------------------------------------------------- // Options // --------------------------------------------------------------------------- export interface DocxCitationsParserOptions { /** * Contents of `customXml/item1.xml` from the DOCX ZIP, using the MS * Office Bibliography XML namespace. Required to resolve Word-native and * `CITATION` keys into full bibliographic data. */ sourcesXml?: string } // --------------------------------------------------------------------------- // Parser class // --------------------------------------------------------------------------- export class DocxCitationsParser { private documentXml: string private options: DocxCitationsParserOptions entries: EntryObject[] errors: ErrorObject[] warnings: ErrorObject[] /** Entry keys already added — prevents duplicates across multiple fields. */ private seenKeys: Set /** Persistent raw CSL id → normalised entry_key map for the instance parse. */ private cslRawIdToEntryKey: Map constructor(documentXml: string, options: DocxCitationsParserOptions = {}) { this.documentXml = documentXml this.options = options this.entries = [] this.errors = [] this.warnings = [] this.seenKeys = new Set() this.cslRawIdToEntryKey = new Map() } // ----------------------------------------------------------------------- // Static utility methods for reusable citation detection and extraction // ----------------------------------------------------------------------- /** * Check if an SDT block contains citation data (without full document parsing). * * @param sdtXml - XML string of a single ... block * @returns CitationCheckResult indicating whether it's a citation and its format */ static sdtCitation( sdtXml: string, retrieve = true, retrieveMetadata = false, acc: CitationAccumulator = { entries: [], errors: [], warnings: [], seenKeys: new Set(), cslRawIdToEntryKey: new Map(), }, ): CitationResult { const { entries, errors, warnings } = acc const tagMatch = sdtXml.match(/ runs. // Always concatenate every instrText fragment inside // before searching for the "ADDIN CitaviPlaceholder" prefix so that // we assemble the complete Base64 payload regardless of how many runs // were used. let b64: string | null = null const sdtContentMatch = sdtXml.match( /]*>([\s\S]*?)<\/w:sdtContent>/, ) if (sdtContentMatch) { const sdtContent = sdtContentMatch[1] const allInstrTexts: string[] = [] const fragRe = /]*>([\s\S]*?)<\/w:instrText>/g let frag: RegExpExecArray | null while ((frag = fragRe.exec(sdtContent)) !== null) { allInstrTexts.push(frag[1]) } const combined = allInstrTexts.join("") const prefixUpper = "ADDIN CITAVIPLACEHOLDER" const upperCombined = combined.toUpperCase() const prefixIdx = upperCombined.indexOf(prefixUpper) if (prefixIdx !== -1) { b64 = combined .slice(prefixIdx + prefixUpper.length) .replace(/\s/g, "") .replace(/^\{/, "") .replace(/\}$/, "") } } if (b64 !== null && b64.length > 0) { DocxCitationsParser.extractCitaviData( b64, acc, retrieveMetadata ? metadata : undefined, ) } else { warnings.push({ type: "citavi_missing_payload", value: tagVal, }) } } const bibDB: Record = {} entries.forEach((entry, i) => { bibDB[i + 1] = entry }) const result: CitationResult = { isCitation: true, format, entries: bibDB, errors, warnings, } if (retrieveMetadata) result.metadata = metadata return result } /** * Check or extract bibliography rendering region from an SDT block. * * @param sdtXml - XML string of a single ... block * @param retrieve - If true, extract data (currently returns empty as bibliographies have no importable data) * @returns BibliographyResult indicating whether it's a bibliography */ static sdtBibliography(sdtXml: string): BibliographyResult { const tagMatch = sdtXml.match(/(), cslRawIdToEntryKey: new Map(), }, ): CitationResult { const { entries, errors, warnings, seenKeys } = acc const upper = instrText.trim().toUpperCase() let format: string | undefined if (upper.startsWith("ADDIN ZOTERO_ITEM")) { format = "zotero" } else if ( upper.startsWith("ADDIN CSL_CITATION") || upper.startsWith("CSL_CITATION") ) { format = "mendeley_legacy" } else if (upper.startsWith("ADDIN EN.CITE")) { format = "endnote" } else if (upper.startsWith("ADDIN CITAVIPLACEHOLDER")) { format = "citavi" } else if (upper.startsWith("CITATION ")) { format = "word_native" } if (!format) { return { isCitation: false } } if (!retrieve) { return { isCitation: true, format } } // Extract citation data const metadata: CitationItemMetadata[] = [] if (format === "zotero" || format === "mendeley_legacy") { const jsonStart = instrText.indexOf("{") if (jsonStart === -1) { warnings.push({ type: `${format}_missing_json` }) } else { const jsonStr = extractJsonObject(instrText, jsonStart) if (jsonStr === null) { warnings.push({ type: `${format}_missing_json` }) } else { DocxCitationsParser.extractCslJsonData( jsonStr, format, acc, retrieveMetadata ? metadata : undefined, ) } } } else if (format === "endnote") { DocxCitationsParser.extractEndNoteData( instrText, fldData, acc, retrieveMetadata ? metadata : undefined, ) } else if (format === "citavi") { const b64Match = instrText.match( /ADDIN CitaviPlaceholder\{?([A-Za-z0-9+/=\s]+)\}?/i, ) if (b64Match) { DocxCitationsParser.extractCitaviData( b64Match[1].replace(/\s/g, ""), acc, retrieveMetadata ? metadata : undefined, ) } } else if (format === "word_native") { // Record the key for later resolution const m = /^CITATION\s+(\S+)/i.exec(instrText.trim()) if (m) { seenKeys.add(m[1]) } // Only extract data if sourcesXml is provided AND extractWordNative is true. // When called from instance methods, extractWordNative is false and we // should only record keys - extraction happens later in parseSourcesXml. if (options.sourcesXml && extractWordNative) { DocxCitationsParser.extractWordNativeData( instrText, options.sourcesXml, acc, ) } } const bibDB: Record = {} entries.forEach((entry, i) => { bibDB[i + 1] = entry }) const result: CitationResult = { isCitation: true, format, entries: bibDB, errors, warnings, } if (retrieveMetadata) result.metadata = metadata return result } /** * Check or extract bibliography rendering region from a field instruction. * * @param instrText - The concatenated instruction text * @param retrieve - If true, extract data (currently returns empty as bibliographies have no importable data) * @returns BibliographyResult indicating whether it's a bibliography */ static fieldBibliography(instrText: string): BibliographyResult { const upper = instrText.trim().toUpperCase() let format: string | undefined if (upper.startsWith("ADDIN ZOTERO_BIBL")) { format = "zotero" } else if (upper.startsWith("ADDIN EN.REFLIST")) { format = "endnote" } else if (upper.startsWith("BIBLIOGRAPHY")) { format = "word_native" } if (!format) { return { isBibliography: false } } const result: BibliographyResult = { isBibliography: true, format, } return result } // ----------------------------------------------------------------------- // Static helper methods for extraction logic // ----------------------------------------------------------------------- /** * Extract CSL citation JSON data. */ private static extractCslJsonData( jsonStr: string, source: string, acc: CitationAccumulator, metadata?: CitationItemMetadata[], ): void { const { entries, errors, warnings, seenKeys } = acc let citation: { citationItems?: Array<{ itemData?: CSLEntry id?: unknown locator?: unknown label?: unknown prefix?: unknown suffix?: unknown "suppress-author"?: unknown "author-only"?: unknown }> } try { citation = JSON.parse(jsonStr) as typeof citation } catch { warnings.push({ type: `${source}_invalid_json`, value: jsonStr.slice(0, 80), }) return } const items = citation.citationItems ?? [] if (items.length === 0) return const cslRecord: Record = {} // Track the raw CSL id for each item index so we can attach metadata later const itemKeys: Array = [] items.forEach((item, i) => { if (!item.itemData) { itemKeys.push(undefined) return } const key = item.itemData.id === undefined ? `${source}_${i}` : String(item.itemData.id) // Always record the raw key; skip adding to cslRecord if already seen itemKeys.push(key) if (seenKeys.has(key)) return cslRecord[key] = item.itemData }) if (Object.keys(cslRecord).length > 0) { const parser = new CSLParser(cslRecord) const bibDB = parser.parse() errors.push(...parser.errors) warnings.push(...parser.warnings) for (const entry of Object.values(bibDB)) { seenKeys.add(entry.entry_key) entries.push(entry) } // Merge the authoritative raw-id → entry_key map from this parse // into the accumulator so future citations can resolve duplicates. for (const [rawId, entryKey] of parser.rawIdToEntryKey) { acc.cslRawIdToEntryKey.set(rawId, entryKey) } } if (metadata) { items.forEach((item, i) => { const rawKey = itemKeys[i] if (!rawKey) return // Resolve normalised entry_key via the persistent accumulator map; // fall back to rawKey only if the entry was never successfully parsed. const entry_key = acc.cslRawIdToEntryKey.get(rawKey) ?? rawKey const meta: CitationItemMetadata = { entry_key } if ( item.locator !== undefined && item.locator !== null && item.locator !== "" ) meta.locator = String(item.locator) if ( item.label !== undefined && item.label !== null && item.label !== "" ) meta.label = String(item.label) if ( item.prefix !== undefined && item.prefix !== null && item.prefix !== "" ) meta.prefix = String(item.prefix) if ( item.suffix !== undefined && item.suffix !== null && item.suffix !== "" ) meta.suffix = String(item.suffix) if (item["suppress-author"]) meta.suppressAuthor = true if (item["author-only"]) meta.authorOnly = true metadata.push(meta) }) } } /** * Extract EndNote citation data. */ private static extractEndNoteData( instrText: string, fldData: string | undefined, acc: CitationAccumulator, metadata?: CitationItemMetadata[], ): void { const { warnings } = acc let xmlPayload = "" if (fldData && fldData.length > 0) { try { xmlPayload = DocxCitationsParser.decodeBase64Static(fldData) } catch { warnings.push({ type: "endnote_invalid_flddata", value: fldData.slice(0, 40), }) return } } else { const idx = instrText.toUpperCase().indexOf("ADDIN EN.CITE") if (idx === -1) return xmlPayload = DocxCitationsParser.unescapeXmlEntitiesStatic( instrText.slice(idx + "ADDIN EN.CITE".length).trim(), ) } if (xmlPayload.includes(" } const hasEmbeddedReferences = !Array.isArray(payload) && Array.isArray(typedPayload.Entries) && typedPayload.Entries?.some( (e) => e.Reference !== null && e.Reference !== undefined, ) if (!hasEmbeddedReferences) { warnings.push({ type: "citavi_missing_embedded_references", value: b64.slice(0, 40), }) return } const parser = new CitaviParser(payload) const bibDB = parser.parse() errors.push(...parser.errors) warnings.push(...parser.warnings) for (const entry of Object.values(bibDB)) { if (!seenKeys.has(entry.entry_key)) { seenKeys.add(entry.entry_key) entries.push(entry) } } if (metadata && typedPayload.Entries) { // Use the authoritative ReferenceId → entry_key map that CitaviParser // builds during parse(). This is reliable even when the parser // synthesises a lastname+year key that bears no resemblance to the // original UUID. for (const citaviEntry of typedPayload.Entries) { const refId = citaviEntry.ReferenceId if (!refId) continue const entry_key = parser.referenceIdToEntryKey.get(refId) ?? refId const meta: CitationItemMetadata = { entry_key } if (citaviEntry.Prefix) meta.prefix = citaviEntry.Prefix if (citaviEntry.Suffix) meta.suffix = citaviEntry.Suffix const pageRange = citaviEntry.PageRange const pageStr = pageRange?.OriginalString if (pageStr !== undefined && pageStr !== null && pageStr !== "") meta.locator = pageStr // Derive a CSL-style locator label from NumberingType. // The integer-to-type mapping is inferred from the Citavi manual's // prose (types listed in order) and has NOT been confirmed against // observed data beyond value 0 (Pages). if ( pageRange && !citaviEntry.UseNumberingTypeOfParentDocument ) { const nt = pageRange.NumberingType if (nt !== undefined && nt !== null && nt !== 0) { const numberingTypeLabels: Record = { 1: "column", // Columns (Col.) — inferred 2: "section", // Section numbers (Nr./§) — inferred 3: "note", // Margin numbers — inferred 4: "custom", // Other / free-form — inferred } const label = numberingTypeLabels[nt] if (label !== undefined) meta.label = label } } if (citaviEntry.BibliographyEntry) meta.bibliographyEntry = citaviEntry.BibliographyEntry if ( citaviEntry.RuleSet !== undefined && citaviEntry.RuleSet !== null ) meta.ruleSet = citaviEntry.RuleSet if ( citaviEntry.FormatOption !== undefined && citaviEntry.FormatOption !== null ) meta.formatOption = citaviEntry.FormatOption if ( citaviEntry.InsertAs !== undefined && citaviEntry.InsertAs !== null ) meta.insertAs = citaviEntry.InsertAs metadata.push(meta) } } } /** * Extract Word native citation data. */ private static extractWordNativeData( instrText: string, sourcesXml: string, acc: CitationAccumulator, ): void { const { entries, errors, warnings, seenKeys } = acc const m = /^CITATION\s+(\S+)/i.exec(instrText.trim()) if (m) { const citationKey = m[1] seenKeys.add(citationKey) // citedKeys = the key we just recorded (the allowlist for this call). // importedKeys = keys already pushed into `entries` so we don't // duplicate them across multiple CITATION fields in the same document. const citedKeys = new Set([citationKey]) const importedKeys = new Set( entries.map((e) => e.entry_key), ) const nativeParser = new DocxNativeParser(sourcesXml) const result = nativeParser.parse(citedKeys, importedKeys) errors.push(...result.errors) warnings.push(...result.warnings) entries.push(...result.entries) } } /** * Parse EndNote XML payload. */ private static parseEndNoteXml( xml: string, acc: CitationAccumulator, metadata?: CitationItemMetadata[], ): void { const { entries, errors, warnings, seenKeys } = acc const records: EndNoteRecord[] = [] // When collecting metadata, capture per-Cite fields before deduplication interface CiteFields { recNum: string prefix?: string suffix?: string pages?: string authorYear?: boolean } const citeFieldsList: CiteFields[] = [] // Match the full opening tag (which may carry AuthorYear="1") plus body const citeRe = /]*)?>([\s\S]*?)<\/Cite>/g let citeMatch: RegExpExecArray | null while ((citeMatch = citeRe.exec(xml)) !== null) { const citeAttrs = citeMatch[1] ?? "" const citeXml = citeMatch[2] const recordMatch = /([\s\S]*?)<\/record>/.exec(citeXml) if (recordMatch) { const record = DocxCitationsParser.parseEndNoteRecordXml( recordMatch[0], ) const key = String(record["rec-number"] ?? "") if (key && !seenKeys.has(key)) { records.push(record) seenKeys.add(key) } if (metadata && key) { // AuthorYear="1" on the opening tag means show author outside parens const authorYear = /AuthorYear\s*=\s*["']?1["']?/i.test( citeAttrs, ) // Only search for citation-level fields (Prefix, Suffix, // Pages) in the portion of that comes *before* the // embedded element. The block contains // the reference's own field, and matching against // the full citeXml would confuse the reference page range // with a per-citation locator. const recordStart = citeXml.indexOf("") const citeHeader = recordStart === -1 ? citeXml : citeXml.slice(0, recordStart) const prefixMatch = /]*>([\s\S]*?)<\/Prefix>/i.exec(citeHeader) const suffixMatch = /]*>([\s\S]*?)<\/Suffix>/i.exec(citeHeader) const pagesMatch = /]*>([\s\S]*?)<\/Pages>/i.exec( citeHeader, ) citeFieldsList.push({ recNum: key, prefix: prefixMatch ? DocxCitationsParser.stripStyleTagsStatic( prefixMatch[1], ) : undefined, suffix: suffixMatch ? DocxCitationsParser.stripStyleTagsStatic( suffixMatch[1], ) : undefined, pages: pagesMatch ? DocxCitationsParser.stripStyleTagsStatic( pagesMatch[1], ) : undefined, authorYear, }) } } } if (records.length === 0) return const parser = new EndNoteParser(records) const result = parser.parse() errors.push(...result.errors) warnings.push(...result.warnings) entries.push(...Object.values(result.entries)) if (metadata) { // Use the authoritative rec-number → entry_key map that EndNoteParser // builds during parse(). This is reliable even when the parser // synthesises a lastname+year key that bears no resemblance to the // original rec-number. for (const cite of citeFieldsList) { const entry_key = parser.recNumberToEntryKey.get(cite.recNum) ?? cite.recNum const meta: CitationItemMetadata = { entry_key } if (cite.prefix) meta.prefix = cite.prefix if (cite.suffix) meta.suffix = cite.suffix if (cite.pages) meta.locator = cite.pages if (cite.authorYear) meta.authorYear = true metadata.push(meta) } } } /** * Parse EndNote record XML. */ private static parseEndNoteRecordXml(recordXml: string): EndNoteRecord { const record: EndNoteRecord = {} const refTypeMatch = recordXml.match( /]*>(\d+)<\/ref-type>/, ) if (refTypeMatch) { record["ref-type"] = { name: refTypeMatch[1] ?? "", "#text": refTypeMatch[2], } } const recNumMatch = recordXml.match( /]*>([\s\S]*?)<\/rec-number>/, ) if (recNumMatch) { record["rec-number"] = recNumMatch[1].trim() } const titlesMatch = recordXml.match(/([\s\S]*?)<\/titles>/) if (titlesMatch) { const t = titlesMatch[1] const titles: Record = {} for (const tag of [ "title", "secondary-title", "tertiary-title", "short-title", "alt-title", "translated-title", ] as const) { const m = t.match( new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`), ) if (m) titles[tag] = { "#text": DocxCitationsParser.stripStyleTagsStatic(m[1]), } } if (Object.keys(titles).length > 0) record.titles = titles } const contribMatch = recordXml.match( /([\s\S]*?)<\/contributors>/, ) if (contribMatch) { record.contributors = DocxCitationsParser.parseContributorsXml( contribMatch[1], ) } const periodicalMatch = recordXml.match( /([\s\S]*?)<\/periodical>/, ) if (periodicalMatch) { const p = periodicalMatch[1] const periodical: Record = {} for (const tag of [ "full-title", "abbr-1", "abbr-2", "abbr-3", ] as const) { const m = p.match( new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`), ) if (m) periodical[tag] = { "#text": DocxCitationsParser.stripStyleTagsStatic(m[1]), } } if (Object.keys(periodical).length > 0) record.periodical = periodical } const scalarFields: Array<[keyof EndNoteRecord, string]> = [ ["pages", "pages"], ["volume", "volume"], ["number", "number"], ["issue", "issue"], ["edition", "edition"], ["section", "section"], ["publisher", "publisher"], ["isbn", "isbn"], ["issn", "issn"], ["abstract", "abstract"], ["notes", "notes"], ["language", "language"], ["label", "label"], ["doi", "doi"], ["electronic-resource-num", "electronic-resource-num"], ] for (const [recordKey, xmlTag] of scalarFields) { const m = recordXml.match( new RegExp(`<${xmlTag}[^>]*>([\\s\\S]*?)<\\/${xmlTag}>`), ) if (m) { // eslint-disable-next-line @typescript-eslint/no-extra-semi ;(record as Record)[recordKey as string] = { "#text": DocxCitationsParser.stripStyleTagsStatic(m[1]), } } } const pubLocMatch = recordXml.match( /]*>([\s\S]*?)<\/pub-location>/, ) if (pubLocMatch) { record["pub-location"] = { "#text": DocxCitationsParser.stripStyleTagsStatic( pubLocMatch[1], ), } } const datesMatch = recordXml.match(/([\s\S]*?)<\/dates>/) if (datesMatch) { record.dates = DocxCitationsParser.parseDatesXml(datesMatch[1]) } const keywordsMatch = recordXml.match( /([\s\S]*?)<\/keywords>/, ) if (keywordsMatch) { const kwMatches = [ ...keywordsMatch[1].matchAll( /]*>([\s\S]*?)<\/keyword>/g, ), ] if (kwMatches.length > 0) { record.keywords = { keyword: kwMatches.map((kw) => ({ "#text": DocxCitationsParser.stripStyleTagsStatic( kw[1], ), })), } } } const urlsMatch = recordXml.match(/([\s\S]*?)<\/urls>/) if (urlsMatch) { record.urls = DocxCitationsParser.parseUrlsXml(urlsMatch[1]) } return record } /** * Parse contributors XML. */ private static parseContributorsXml( xml: string, ): Record }> { const result: Record }> = {} for (const group of [ "authors", "secondary-authors", "tertiary-authors", "subsidiary-authors", ] as const) { const m = xml.match( new RegExp(`<${group}[^>]*>([\\s\\S]*?)<\\/${group}>`), ) if (m) { const authorMatches = [ ...m[1].matchAll(/]*>([\s\S]*?)<\/author>/g), ] if (authorMatches.length > 0) { result[group] = { author: authorMatches.map((author) => ({ "#text": DocxCitationsParser.stripStyleTagsStatic( author[1], ), })), } } } } return result } /** * Parse dates XML. */ private static parseDatesXml(xml: string): Record { const dates: Record = {} const yearMatch = xml.match(/]*>([\s\S]*?)<\/year>/) if (yearMatch) { dates.year = { "#text": DocxCitationsParser.stripStyleTagsStatic(yearMatch[1]), } } const pubDatesMatch = xml.match(/([\s\S]*?)<\/pub-dates>/) if (pubDatesMatch) { const dateMatches = [ ...pubDatesMatch[1].matchAll(/]*>([\s\S]*?)<\/date>/g), ] if (dateMatches.length > 0) { dates["pub-dates"] = { date: dateMatches.map((d) => ({ "#text": DocxCitationsParser.stripStyleTagsStatic(d[1]), })), } } } return dates } /** * Parse URLs XML. */ private static parseUrlsXml(xml: string): Record { const urls: Record = {} for (const group of [ "web-urls", "pdf-urls", "related-urls", "text-urls", "image-urls", ] as const) { const m = xml.match( new RegExp(`<${group}>([\\s\\S]*?)<\\/${group}>`), ) if (!m) continue const urlMatches = [ ...m[1].matchAll(/]*>([\s\S]*?)<\/url>/g), ] if (urlMatches.length > 0) { urls[group] = { url: urlMatches.map((u) => ({ "#text": DocxCitationsParser.stripStyleTagsStatic(u[1]), })), } } } return urls } /** * Strip style tags and decode XML entities. */ private static stripStyleTagsStatic(text: string): string { return text .replace(/]*>([\s\S]*?)<\/style>/g, "$1") .replace(/<[^>]+>/g, "") .replace(/</g, "<") .replace(/>/g, ">") .replace(/&/g, "&") .replace(/"/g, '"') .replace(/'/g, "'") .trim() } /** * Unescape XML entities. */ private static unescapeXmlEntitiesStatic(text: string): string { return text .replace(/</g, "<") .replace(/>/g, ">") .replace(/&/g, "&") .replace(/"/g, '"') .replace(/'/g, "'") } /** * Decode base64. */ private static decodeBase64Static(b64: string): string { const binary = atob(b64) const bytes = new Uint8Array(binary.length) for (let i = 0; i < binary.length; i++) { bytes[i] = binary.charCodeAt(i) } let end = bytes.length while (end > 0 && bytes[end - 1] === 0) { end-- } return new TextDecoder("utf-8").decode(bytes.subarray(0, end)) } // ----------------------------------------------------------------------- // Public API // ----------------------------------------------------------------------- parse(): DocxCitationsParseResult { // 1) Parse SDT blocks (Mendeley v3, Citavi with embedded references). this.parseSdtBlocks() // 2) Parse field codes (Zotero, legacy Mendeley, Word native, etc.). this.parseFieldCodes() // 3) Parse sources XML if provided (Word native). if (this.options.sourcesXml) { this.parseSourcesXml(this.options.sourcesXml) } // Build final BibDB const bibDB: Record = {} this.entries.forEach((entry, i) => { bibDB[i + 1] = entry }) return { entries: bibDB, errors: this.errors, warnings: this.warnings, } } // ----------------------------------------------------------------------- // Step 1 — structured document tags // ----------------------------------------------------------------------- private parseSdtBlocks(): void { const sdtRe = /]*>([\s\S]*?)<\/w:sdt>/g let m: RegExpExecArray | null while ((m = sdtRe.exec(this.documentXml)) !== null) { DocxCitationsParser.sdtCitation(m[1], true, false, { entries: this.entries, errors: this.errors, warnings: this.warnings, seenKeys: this.seenKeys, cslRawIdToEntryKey: this.cslRawIdToEntryKey, }) } } // ----------------------------------------------------------------------- // Step 2 — field codes (begin → separate) // ----------------------------------------------------------------------- /** * Tokenises the document XML into field-code events, replays them with a * depth counter to handle nested fields (EN.CITE wraps EN.CITE.DATA), and * dispatches each completed instruction to the correct handler. * * Per the DOCX spec (and documented in CITATIONS_IN_DOCS.md), all * elements between `begin` and `separate` must be * concatenated before the instruction is interpreted. */ private parseFieldCodes(): void { type Token = | { kind: "begin"; fldData?: string } | { kind: "separate" } | { kind: "end" } | { kind: "instr"; text: string } | { kind: "flddata"; data: string } const tokens: Token[] = [] // Matches, in order of priority: // 1. fldChar begin (optionally contains a child element // when the fldData is nested inside the fldChar tag itself) // 2. fldChar separate // 3. fldChar end // 4. instrText // 5. standalone element (the common real-world form where // EndNote places the base64 payload in its own run as a // sibling of , not nested inside it) const tokenRe = /]*\/?>(?:]*>([\s\S]*?)<\/w:fldData>(?:<\/w:fldChar>))?|]*\/?>|]*\/?>|]*>([\s\S]*?)<\/w:instrText>|]*>([\s\S]*?)<\/w:fldData>/g let t: RegExpExecArray | null while ((t = tokenRe.exec(this.documentXml)) !== null) { const full = t[0] if (full.includes('fldCharType="begin"')) { const raw = t[1] ?? "" tokens.push({ kind: "begin", fldData: raw ? raw.replace(/\s/g, "") : undefined, }) } else if (full.includes('fldCharType="separate"')) { tokens.push({ kind: "separate" }) } else if (full.includes('fldCharType="end"')) { tokens.push({ kind: "end" }) } else if (full.startsWith(" element (sibling run, not nested in // ). Capture group index 3 (the fifth alternation). const raw = t[3] ?? "" tokens.push({ kind: "flddata", data: raw.replace(/\s/g, ""), }) } else { tokens.push({ kind: "instr", text: t[2] ?? "" }) } } // Replay with a stack so nested fields are handled correctly const stack: Array<{ instrParts: string[] fldData: string | undefined pastSeparate: boolean }> = [] for (const token of tokens) { if (token.kind === "begin") { stack.push({ instrParts: [], fldData: token.fldData, pastSeparate: false, }) } else if (token.kind === "flddata") { // Attach sibling fldData to the innermost open field frame, // but only if that frame does not already have fldData (the // nested-child form takes precedence). if (stack.length > 0 && !stack[stack.length - 1].fldData) { stack[stack.length - 1].fldData = token.data } } else if (token.kind === "separate") { if (stack.length > 0) { stack[stack.length - 1].pastSeparate = true } } else if (token.kind === "end") { if (stack.length === 0) continue const frame = stack.pop()! const instr = frame.instrParts.join("").trim() DocxCitationsParser.fieldCitation( instr, true, false, // Don't retrieve metadata here; false, // Don't extract Word-native here; defer to parseSourcesXml frame.fldData, this.options, { entries: this.entries, errors: this.errors, warnings: this.warnings, seenKeys: this.seenKeys, cslRawIdToEntryKey: this.cslRawIdToEntryKey, }, ) } else if ( stack.length > 0 && !stack[stack.length - 1].pastSeparate ) { stack[stack.length - 1].instrParts.push(token.text) } } } // ----------------------------------------------------------------------- // Format handlers — each delegates to an existing parser // ----------------------------------------------------------------------- // --- Zotero DOCX --- // --- Word native sources XML --- /** * Delegates to DocxNativeParser, passing `seenKeys` so that sources * already imported via other field types are not duplicated, and so that * newly imported keys are recorded for future deduplication. */ private parseSourcesXml(xml: string): void { // this.seenKeys contains every citation key recorded from CITATION // field instructions — use it as the cited-keys allowlist so that only // sources actually referenced in the document are imported. // A fresh importedKeys set is used for deduplication within this call; // at this point this.entries is still empty for the Word-native path so // there is nothing to pre-populate it with. const importedKeys = new Set( this.entries.map((e) => e.entry_key), ) const nativeParser = new DocxNativeParser(xml) const result = nativeParser.parse(this.seenKeys, importedKeys) this.errors.push(...result.errors) this.warnings.push(...result.warnings) this.entries.push(...result.entries) } } // --------------------------------------------------------------------------- // Convenience function // --------------------------------------------------------------------------- export function parseDocxCitations( documentXml: string, options: DocxCitationsParserOptions = {}, ): DocxCitationsParseResult { return new DocxCitationsParser(documentXml, options).parse() }