UNPKG

@au5ton/use-atom-feed/src/Parser.ts

Version:

6.89 kBPlain TextView Raw

1import { AtomCategory, AtomContent, AtomLink, AtomLinkRelType, AtomPerson, AtomText, AtomTextType } from './AtomCommon';
2import { AtomEntry, AtomSource } from './AtomEntry';
3import { AtomFeed } from './AtomFeed';
4
5import { sanitize } from 'dompurify';
6
/**
 * Returns the first element in `nodes` whose `nodeName` equals `tagName`,
 * or `undefined` when there is none. Only the supplied collection is
 * inspected; descendants are never visited.
 */
function findByTag(nodes: Iterable<Element> | ArrayLike<Element> | HTMLCollection, tagName: string): Element | undefined {
  for (const node of Array.from(nodes)) {
    if (node.nodeName === tagName) return node;
  }
  return undefined;
}

/**
 * Returns every element in `nodes` whose `nodeName` equals `tagName`, in
 * their original order. Only the supplied collection is inspected.
 */
function filterByTag(nodes: Iterable<Element> | ArrayLike<Element> | HTMLCollection, tagName: string): Element[] {
  const matches: Element[] = [];
  for (const node of Array.from(nodes)) {
    if (node.nodeName === tagName) matches.push(node);
  }
  return matches;
}

/** `findByTag()` applied to the direct children of `parent`. */
function findChildTag(parent: Document | Element, tagName: string): Element | undefined {
  return findByTag(parent.children, tagName);
}

/** `filterByTag()` applied to the direct children of `parent`. */
function filterChildTags(parent: Document | Element, tagName: string): Element[] {
  return filterByTag(parent.children, tagName);
}
15
/**
 * Parses an Atom XML document into an `AtomFeed`.
 *
 * Only direct children of the top-level `<feed>` element are read.
 * Repeatable elements (`entry`, `author`, `link`, `category`,
 * `contributor`) become arrays, which are empty when the element is absent.
 * `generator` is always present in the result; its `value` is `''` when the
 * document has no `<generator>` element.
 *
 * @param data raw XML text of the feed
 * @returns the parsed feed
 * @throws Error when the document has no top-level `<feed>` element
 */
export function parseAtomFeed(data: string): AtomFeed {
  const xml = new DOMParser().parseFromString(data, 'text/xml');
  const feed = findChildTag(xml, 'feed');
  // Guard clause: without a root <feed> there is nothing to parse.
  if (!feed) {
    throw Error('No <feed> tag found.');
  }

  // Looked up once and reused for all three generator fields.
  const generator = findChildTag(feed, 'generator');
  const updatedText = findChildTag(feed, 'updated')?.textContent;

  return {
    id: sanitizeTextContent(findChildTag(feed, 'id')) ?? '',
    title: parseAtomText(findChildTag(feed, 'title')),
    // A missing <updated> falls back to the Unix epoch.
    updated: new Date(updatedText ?? 0),
    entries: filterChildTags(feed, 'entry').map(entry => parseAtomEntry(entry)),
    author: filterChildTags(feed, 'author').map(person => parseAtomPerson(person)),
    link: filterChildTags(feed, 'link').map(el => parseAtomLink(el)),
    category: filterChildTags(feed, 'category').map(el => parseAtomCategory(el)),
    contributor: filterChildTags(feed, 'contributor').map(person => parseAtomPerson(person)),
    generator: {
      value: sanitizeTextContent(generator) ?? '',
      uri: sanitizeTextAttribute(generator, 'uri'),
      version: sanitizeTextAttribute(generator, 'version'),
    },
    icon: sanitizeTextContent(findChildTag(feed, 'icon')),
    logo: sanitizeTextContent(findChildTag(feed, 'logo')),
    rights: parseAtomText(findChildTag(feed, 'rights')),
    subtitle: sanitizeTextContent(findChildTag(feed, 'subtitle')),
  };
}
44
/**
 * Converts one `<entry>` element into an `AtomEntry`.
 *
 * Only direct children of `entry` are read. Repeatable elements become
 * arrays (empty when absent). `published` is `undefined` when the entry has
 * no `<published>` child, while a missing `<updated>` falls back to the
 * Unix epoch.
 *
 * @param entry the `<entry>` element to read
 * @returns the parsed entry
 */
export function parseAtomEntry(entry: Element): AtomEntry {
  const updatedText = findChildTag(entry, 'updated')?.textContent;
  // Looked up once; decides whether `published` is present at all.
  const publishedEl = findChildTag(entry, 'published');
  const published = publishedEl ? new Date(publishedEl.textContent ?? 0) : undefined;

  return {
    id: sanitizeTextContent(findChildTag(entry, 'id')) ?? '',
    title: parseAtomText(findChildTag(entry, 'title')),
    updated: new Date(updatedText ?? 0),
    author: filterChildTags(entry, 'author').map(person => parseAtomPerson(person)),
    content: parseAtomContent(findChildTag(entry, 'content')),
    link: filterChildTags(entry, 'link').map(el => parseAtomLink(el)),
    summary: parseAtomText(findChildTag(entry, 'summary')),
    category: filterChildTags(entry, 'category').map(el => parseAtomCategory(el)),
    contributor: filterChildTags(entry, 'contributor').map(person => parseAtomPerson(person)),
    published,
    rights: parseAtomText(findChildTag(entry, 'rights')),
    source: parseAtomSource(findChildTag(entry, 'source')),
  };
}
61
/**
 * Decodes the body of an Atom text/content element into a sanitized string.
 *
 * @param type value of the element's `type` attribute. A missing value is
 *   treated as `'text'`, the default RFC 4287 prescribes when the attribute
 *   is absent; previously such elements decoded to `''`.
 * @param element the element to decode
 * @returns the sanitized markup, or `''` when `element` is `undefined` or
 *   `type` is not one of `'text'`, `'html'`, `'xhtml'`
 */
export function safelyDecodeAtomText(type: AtomTextType, element: Element | undefined): string {
  if (element === undefined) return '';

  switch (type ?? 'text') {
    // type="xhtml": the element holds inline XHTML (wrapped in a div), so its
    // serialized `.innerHTML` is what gets sanitized.
    case 'xhtml':
      return sanitize(element.innerHTML);
    // type="html": the element holds entity-escaped HTML; `.textContent`
    // un-escapes it before sanitizing.
    case 'html':
      return sanitize(element.textContent ?? '');
    // type="text" (or no type): the content is meant to be plain text, but it
    // is sanitized anyway so it cannot become an attack vector.
    case 'text':
      return sanitize(element.innerHTML);
    default:
      return '';
  }
}
78
/**
 * Sanitizes the `.textContent` of an element.
 *
 * @param element the element to read, if any
 * @returns the sanitized text (a `null` `textContent` is sanitized as `''`),
 *   or `undefined` when `element` is `undefined`
 */
export function sanitizeTextContent(element: Element | undefined): string | undefined {
  if (element === undefined) {
    return undefined;
  }
  return sanitize(element.textContent ?? '');
}
83
/**
 * Sanitizes the value of one attribute of an element.
 *
 * @typeParam T type the sanitized string is cast to (unchecked at runtime)
 * @param element the element to read, if any
 * @param attributeName name of the attribute to read
 * @returns the sanitized attribute value, or `undefined` when `element` is
 *   `undefined` or the attribute is not set
 */
export function sanitizeTextAttribute<T = string>(element: Element | undefined, attributeName: string): T | undefined {
  // `undefined` when there is no element, `null` when the attribute is absent.
  const raw = element?.getAttribute(attributeName);
  if (raw === undefined || raw === null) {
    return undefined;
  }
  return sanitize(raw) as unknown as T;
}
88
/**
 * Converts a `<content>` element into an `AtomContent`.
 *
 * @param content the element to read; may be `undefined`
 * @returns the sanitized `type` and `src` attributes (each `undefined` when
 *   not set) together with the decoded body produced by
 *   `safelyDecodeAtomText()`
 */
export function parseAtomContent(content: Element | undefined): AtomContent {
  const type = sanitizeTextAttribute<AtomTextType>(content, 'type');
  const src = sanitizeTextAttribute(content, 'src');
  const value = safelyDecodeAtomText(type, content);
  return { type, src, value };
}
97
98
/**
 * Converts a text element (e.g. `<title>`, `<rights>`, `<summary>`) into an
 * `AtomText`.
 *
 * @param text the element to read; may be `undefined`
 * @returns the sanitized `type` attribute (`undefined` when not set) together
 *   with the decoded body produced by `safelyDecodeAtomText()`
 */
export function parseAtomText(text: Element | undefined): AtomText {
  const type = sanitizeTextAttribute<AtomTextType>(text, 'type');
  const value = safelyDecodeAtomText(type, text);
  return { type, value };
}
106
/**
 * Converts a person element (such as `<author>` or `<contributor>`) into an
 * `AtomPerson`.
 *
 * @param person the element whose direct children are read
 * @returns the sanitized `name` (built from `''` when there is no `<name>`
 *   child) plus `uri` and `email`, each `undefined` when its child is absent
 */
export function parseAtomPerson(person: Element): AtomPerson {
  const rawName = findChildTag(person, 'name')?.textContent ?? '';
  const name = sanitize(rawName);
  const uri = sanitizeTextContent(findChildTag(person, 'uri'));
  const email = sanitizeTextContent(findChildTag(person, 'email'));
  return { name, uri, email };
}
114
/**
 * Converts a `<link>` element into an `AtomLink`.
 *
 * All fields are read from attributes of `link` and sanitized. `href`
 * defaults to `''` when missing; every other field is `undefined` when its
 * attribute is not set.
 *
 * @param link the `<link>` element to read
 * @returns the parsed link
 */
export function parseAtomLink(link: Element): AtomLink {
  return {
    href: sanitizeTextAttribute(link, 'href') ?? '',
    // The relation attribute is `rel`; reading `ref` always produced `undefined`.
    rel: sanitizeTextAttribute<AtomLinkRelType>(link, 'rel'),
    type: sanitizeTextAttribute(link, 'type'),
    hreflang: sanitizeTextAttribute(link, 'hreflang'),
    title: sanitizeTextAttribute(link, 'title'),
    length: sanitizeTextAttribute(link, 'length'),
  };
}
125
/**
 * Converts a `<category>` element into an `AtomCategory`.
 *
 * @param category the `<category>` element to read
 * @returns the sanitized `term` attribute (`''` when missing) plus `scheme`
 *   and `label`, each `undefined` when its attribute is not set
 */
export function parseAtomCategory(category: Element): AtomCategory {
  const term = sanitizeTextAttribute(category, 'term') ?? '';
  const scheme = sanitizeTextAttribute(category, 'scheme');
  const label = sanitizeTextAttribute(category, 'label');
  return { term, scheme, label };
}
133
/**
 * Converts a `<source>` element into an `AtomSource`.
 *
 * @param source the `<source>` element, or `undefined` when the entry has none
 * @returns `undefined` when `source` is `undefined`; otherwise the sanitized
 *   `id` and `title` (each `''` when its child is missing) and the `updated`
 *   date, which falls back to the Unix epoch when `<updated>` is missing
 */
export function parseAtomSource(source: Element | undefined): AtomSource | undefined {
  if (source === undefined) {
    return undefined;
  }
  return {
    id: sanitizeTextContent(findChildTag(source, 'id')) ?? '',
    title: sanitizeTextContent(findChildTag(source, 'title')) ?? '',
    // Read the timestamp from <updated>; it was previously built from <title>.
    updated: new Date(findChildTag(source, 'updated')?.textContent ?? 0),
  };
}
\No newline at end of file

1	`import { AtomCategory, AtomContent, AtomLink, AtomLinkRelType, AtomPerson, AtomText, AtomTextType } from './AtomCommon';`
2	`import { AtomEntry, AtomSource } from './AtomEntry';`
3	`import { AtomFeed } from './AtomFeed';`
4
5	`import { sanitize } from 'dompurify';`
6
/**
 * Returns the first element in `nodes` whose `nodeName` equals `tagName`,
 * or `undefined` when there is none. Only the supplied collection is
 * inspected; descendants are never visited.
 */
function findByTag(nodes: Iterable<Element> | ArrayLike<Element> | HTMLCollection, tagName: string): Element | undefined {
  for (const node of Array.from(nodes)) {
    if (node.nodeName === tagName) return node;
  }
  return undefined;
}

/**
 * Returns every element in `nodes` whose `nodeName` equals `tagName`, in
 * their original order. Only the supplied collection is inspected.
 */
function filterByTag(nodes: Iterable<Element> | ArrayLike<Element> | HTMLCollection, tagName: string): Element[] {
  const matches: Element[] = [];
  for (const node of Array.from(nodes)) {
    if (node.nodeName === tagName) matches.push(node);
  }
  return matches;
}

/** `findByTag()` applied to the direct children of `parent`. */
function findChildTag(parent: Document | Element, tagName: string): Element | undefined {
  return findByTag(parent.children, tagName);
}

/** `filterByTag()` applied to the direct children of `parent`. */
function filterChildTags(parent: Document | Element, tagName: string): Element[] {
  return filterByTag(parent.children, tagName);
}
15
/**
 * Parses an Atom XML document into an `AtomFeed`.
 *
 * Only direct children of the top-level `<feed>` element are read.
 * Repeatable elements (`entry`, `author`, `link`, `category`,
 * `contributor`) become arrays, which are empty when the element is absent.
 * `generator` is always present in the result; its `value` is `''` when the
 * document has no `<generator>` element.
 *
 * @param data raw XML text of the feed
 * @returns the parsed feed
 * @throws Error when the document has no top-level `<feed>` element
 */
export function parseAtomFeed(data: string): AtomFeed {
  const xml = new DOMParser().parseFromString(data, 'text/xml');
  const feed = findChildTag(xml, 'feed');
  // Guard clause: without a root <feed> there is nothing to parse.
  if (!feed) {
    throw Error('No <feed> tag found.');
  }

  // Looked up once and reused for all three generator fields.
  const generator = findChildTag(feed, 'generator');
  const updatedText = findChildTag(feed, 'updated')?.textContent;

  return {
    id: sanitizeTextContent(findChildTag(feed, 'id')) ?? '',
    title: parseAtomText(findChildTag(feed, 'title')),
    // A missing <updated> falls back to the Unix epoch.
    updated: new Date(updatedText ?? 0),
    entries: filterChildTags(feed, 'entry').map(entry => parseAtomEntry(entry)),
    author: filterChildTags(feed, 'author').map(person => parseAtomPerson(person)),
    link: filterChildTags(feed, 'link').map(el => parseAtomLink(el)),
    category: filterChildTags(feed, 'category').map(el => parseAtomCategory(el)),
    contributor: filterChildTags(feed, 'contributor').map(person => parseAtomPerson(person)),
    generator: {
      value: sanitizeTextContent(generator) ?? '',
      uri: sanitizeTextAttribute(generator, 'uri'),
      version: sanitizeTextAttribute(generator, 'version'),
    },
    icon: sanitizeTextContent(findChildTag(feed, 'icon')),
    logo: sanitizeTextContent(findChildTag(feed, 'logo')),
    rights: parseAtomText(findChildTag(feed, 'rights')),
    subtitle: sanitizeTextContent(findChildTag(feed, 'subtitle')),
  };
}
44
/**
 * Converts one `<entry>` element into an `AtomEntry`.
 *
 * Only direct children of `entry` are read. Repeatable elements become
 * arrays (empty when absent). `published` is `undefined` when the entry has
 * no `<published>` child, while a missing `<updated>` falls back to the
 * Unix epoch.
 *
 * @param entry the `<entry>` element to read
 * @returns the parsed entry
 */
export function parseAtomEntry(entry: Element): AtomEntry {
  const updatedText = findChildTag(entry, 'updated')?.textContent;
  // Looked up once; decides whether `published` is present at all.
  const publishedEl = findChildTag(entry, 'published');
  const published = publishedEl ? new Date(publishedEl.textContent ?? 0) : undefined;

  return {
    id: sanitizeTextContent(findChildTag(entry, 'id')) ?? '',
    title: parseAtomText(findChildTag(entry, 'title')),
    updated: new Date(updatedText ?? 0),
    author: filterChildTags(entry, 'author').map(person => parseAtomPerson(person)),
    content: parseAtomContent(findChildTag(entry, 'content')),
    link: filterChildTags(entry, 'link').map(el => parseAtomLink(el)),
    summary: parseAtomText(findChildTag(entry, 'summary')),
    category: filterChildTags(entry, 'category').map(el => parseAtomCategory(el)),
    contributor: filterChildTags(entry, 'contributor').map(person => parseAtomPerson(person)),
    published,
    rights: parseAtomText(findChildTag(entry, 'rights')),
    source: parseAtomSource(findChildTag(entry, 'source')),
  };
}
61
/**
 * Decodes the body of an Atom text/content element into a sanitized string.
 *
 * @param type value of the element's `type` attribute. A missing value is
 *   treated as `'text'`, the default RFC 4287 prescribes when the attribute
 *   is absent; previously such elements decoded to `''`.
 * @param element the element to decode
 * @returns the sanitized markup, or `''` when `element` is `undefined` or
 *   `type` is not one of `'text'`, `'html'`, `'xhtml'`
 */
export function safelyDecodeAtomText(type: AtomTextType, element: Element | undefined): string {
  if (element === undefined) return '';

  switch (type ?? 'text') {
    // type="xhtml": the element holds inline XHTML (wrapped in a div), so its
    // serialized `.innerHTML` is what gets sanitized.
    case 'xhtml':
      return sanitize(element.innerHTML);
    // type="html": the element holds entity-escaped HTML; `.textContent`
    // un-escapes it before sanitizing.
    case 'html':
      return sanitize(element.textContent ?? '');
    // type="text" (or no type): the content is meant to be plain text, but it
    // is sanitized anyway so it cannot become an attack vector.
    case 'text':
      return sanitize(element.innerHTML);
    default:
      return '';
  }
}
78
/**
 * Sanitizes the `.textContent` of an element.
 *
 * @param element the element to read, if any
 * @returns the sanitized text (a `null` `textContent` is sanitized as `''`),
 *   or `undefined` when `element` is `undefined`
 */
export function sanitizeTextContent(element: Element | undefined): string | undefined {
  if (element === undefined) {
    return undefined;
  }
  return sanitize(element.textContent ?? '');
}
83
/**
 * Sanitizes the value of one attribute of an element.
 *
 * @typeParam T type the sanitized string is cast to (unchecked at runtime)
 * @param element the element to read, if any
 * @param attributeName name of the attribute to read
 * @returns the sanitized attribute value, or `undefined` when `element` is
 *   `undefined` or the attribute is not set
 */
export function sanitizeTextAttribute<T = string>(element: Element | undefined, attributeName: string): T | undefined {
  // `undefined` when there is no element, `null` when the attribute is absent.
  const raw = element?.getAttribute(attributeName);
  if (raw === undefined || raw === null) {
    return undefined;
  }
  return sanitize(raw) as unknown as T;
}
88
/**
 * Converts a `<content>` element into an `AtomContent`.
 *
 * @param content the element to read; may be `undefined`
 * @returns the sanitized `type` and `src` attributes (each `undefined` when
 *   not set) together with the decoded body produced by
 *   `safelyDecodeAtomText()`
 */
export function parseAtomContent(content: Element | undefined): AtomContent {
  const type = sanitizeTextAttribute<AtomTextType>(content, 'type');
  const src = sanitizeTextAttribute(content, 'src');
  const value = safelyDecodeAtomText(type, content);
  return { type, src, value };
}
97
98
/**
 * Converts a text element (e.g. `<title>`, `<rights>`, `<summary>`) into an
 * `AtomText`.
 *
 * @param text the element to read; may be `undefined`
 * @returns the sanitized `type` attribute (`undefined` when not set) together
 *   with the decoded body produced by `safelyDecodeAtomText()`
 */
export function parseAtomText(text: Element | undefined): AtomText {
  const type = sanitizeTextAttribute<AtomTextType>(text, 'type');
  const value = safelyDecodeAtomText(type, text);
  return { type, value };
}
106
/**
 * Converts a person element (such as `<author>` or `<contributor>`) into an
 * `AtomPerson`.
 *
 * @param person the element whose direct children are read
 * @returns the sanitized `name` (built from `''` when there is no `<name>`
 *   child) plus `uri` and `email`, each `undefined` when its child is absent
 */
export function parseAtomPerson(person: Element): AtomPerson {
  const rawName = findChildTag(person, 'name')?.textContent ?? '';
  const name = sanitize(rawName);
  const uri = sanitizeTextContent(findChildTag(person, 'uri'));
  const email = sanitizeTextContent(findChildTag(person, 'email'));
  return { name, uri, email };
}
114
/**
 * Converts a `<link>` element into an `AtomLink`.
 *
 * All fields are read from attributes of `link` and sanitized. `href`
 * defaults to `''` when missing; every other field is `undefined` when its
 * attribute is not set.
 *
 * @param link the `<link>` element to read
 * @returns the parsed link
 */
export function parseAtomLink(link: Element): AtomLink {
  return {
    href: sanitizeTextAttribute(link, 'href') ?? '',
    // The relation attribute is `rel`; reading `ref` always produced `undefined`.
    rel: sanitizeTextAttribute<AtomLinkRelType>(link, 'rel'),
    type: sanitizeTextAttribute(link, 'type'),
    hreflang: sanitizeTextAttribute(link, 'hreflang'),
    title: sanitizeTextAttribute(link, 'title'),
    length: sanitizeTextAttribute(link, 'length'),
  };
}
125
/**
 * Converts a `<category>` element into an `AtomCategory`.
 *
 * @param category the `<category>` element to read
 * @returns the sanitized `term` attribute (`''` when missing) plus `scheme`
 *   and `label`, each `undefined` when its attribute is not set
 */
export function parseAtomCategory(category: Element): AtomCategory {
  const term = sanitizeTextAttribute(category, 'term') ?? '';
  const scheme = sanitizeTextAttribute(category, 'scheme');
  const label = sanitizeTextAttribute(category, 'label');
  return { term, scheme, label };
}
133
/**
 * Converts a `<source>` element into an `AtomSource`.
 *
 * @param source the `<source>` element, or `undefined` when the entry has none
 * @returns `undefined` when `source` is `undefined`; otherwise the sanitized
 *   `id` and `title` (each `''` when its child is missing) and the `updated`
 *   date, which falls back to the Unix epoch when `<updated>` is missing
 */
export function parseAtomSource(source: Element | undefined): AtomSource | undefined {
  if (source === undefined) {
    return undefined;
  }
  return {
    id: sanitizeTextContent(findChildTag(source, 'id')) ?? '',
    title: sanitizeTextContent(findChildTag(source, 'title')) ?? '',
    // Read the timestamp from <updated>; it was previously built from <title>.
    updated: new Date(findChildTag(source, 'updated')?.textContent ?? 0),
  };
}
\	No newline at end of file