// UNPKG · 7.08 kB · JavaScript · View Raw
// Media-type values that `normalizeDataURL` treats as defaults and therefore
// leaves out of the normalized data URL.
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
const DATA_URL_DEFAULT_MIME_TYPE = 'text/plain';
const DATA_URL_DEFAULT_CHARSET = 'us-ascii';

/**
 * Check whether `name` is matched by at least one entry of `filters`.
 *
 * @param {string} name - Value to test (a query parameter key or a path component).
 * @param {Array<RegExp|string>} filters - RegExp entries are tested against `name`;
 *   any other entry must be strictly equal to `name`.
 * @returns {boolean} `true` if any filter matches, otherwise `false`.
 */
const testParameter = (name, filters) => filters.some(filter => {
	if (!(filter instanceof RegExp)) {
		return filter === name;
	}

	// Global and sticky regexes resume from `lastIndex`, so a previous call
	// could make this one start mid-string and miss a match. Start from 0.
	if (filter.global || filter.sticky) {
		filter.lastIndex = 0;
	}

	return filter.test(name);
});

/**
 * Normalize a `data:` URL.
 *
 * The MIME type and the `charset` value are lowercased, a charset or MIME type
 * equal to the module defaults is dropped, attribute keys/values are trimmed,
 * and base64 payloads are trimmed of surrounding whitespace.
 *
 * @param {string} urlString - The data URL to normalize.
 * @param {{stripHash: boolean}} options - When `stripHash` is truthy the `#fragment` is removed.
 * @returns {string} The normalized data URL (always with a lowercase `data:` scheme).
 * @throws {Error} If `urlString` does not have the `data:<type>,<data>` shape.
 */
const normalizeDataURL = (urlString, {stripHash}) => {
	// The scheme is matched case-insensitively so that it agrees with the
	// `/^data:/i` dispatch in `normalizeUrl`; otherwise `DATA:...` would be
	// routed here only to be rejected as invalid.
	const match = /^data:(?<type>[^,]*?),(?<data>[^#]*?)(?:#(?<hash>.*))?$/i.exec(urlString);

	if (!match) {
		throw new Error(`Invalid URL: ${urlString}`);
	}

	let {type, data, hash} = match.groups;
	const mediaType = type.split(';');
	hash = stripHash ? '' : hash;

	// A trailing `base64` token is a flag, not an attribute; set it aside and
	// re-append it after the attributes below.
	let isBase64 = false;
	if (mediaType[mediaType.length - 1] === 'base64') {
		mediaType.pop();
		isBase64 = true;
	}

	// Lowercase MIME type
	const mimeType = (mediaType.shift() || '').toLowerCase();
	const attributes = mediaType
		.map(attribute => {
			let [key, value = ''] = attribute.split('=').map(string => string.trim());

			// Lowercase `charset`
			if (key === 'charset') {
				value = value.toLowerCase();

				// The default charset is implied, so drop the attribute entirely.
				if (value === DATA_URL_DEFAULT_CHARSET) {
					return '';
				}
			}

			return `${key}${value ? `=${value}` : ''}`;
		})
		.filter(Boolean);

	const normalizedMediaType = [
		...attributes,
	];

	if (isBase64) {
		normalizedMediaType.push('base64');
	}

	// The MIME type is emitted when it is non-default, or whenever anything
	// follows it (attributes or `base64`), since those need the leading slot.
	if (normalizedMediaType.length > 0 || (mimeType && mimeType !== DATA_URL_DEFAULT_MIME_TYPE)) {
		normalizedMediaType.unshift(mimeType);
	}

	return `data:${normalizedMediaType.join(';')},${isBase64 ? data.trim() : data}${hash ? `#${hash}` : ''}`;
};

/**
 * Normalize a URL string.
 *
 * `data:` URLs are delegated to `normalizeDataURL`. Every other input is given
 * a protocol if it lacks one, parsed with `URL`, adjusted according to
 * `options`, and serialized back to a string.
 *
 * @param {string} urlString - The URL to normalize; surrounding whitespace is trimmed.
 * @param {object} [options] - Overrides for the defaults declared at the top of the
 *   function body. `stripProtocol` has no default and is therefore off unless set.
 * @returns {string} The normalized URL.
 * @throws {Error} If the URL starts with `view-source:`, or if both `forceHttp`
 *   and `forceHttps` are set.
 * @throws {TypeError} If `new URL()` cannot parse the input (for example a
 *   relative path such as `/foo`, which is deliberately not given a protocol).
 */
export default function normalizeUrl(urlString, options) {
	options = {
		defaultProtocol: 'http:',
		normalizeProtocol: true,
		forceHttp: false,
		forceHttps: false,
		stripAuthentication: true,
		stripHash: false,
		stripTextFragment: true,
		stripWWW: true,
		removeQueryParameters: [/^utm_\w+/i],
		removeTrailingSlash: true,
		removeSingleSlash: true,
		removeDirectoryIndex: false,
		sortQueryParameters: true,
		...options,
	};

	urlString = urlString.trim();

	// Data URL
	if (/^data:/i.test(urlString)) {
		return normalizeDataURL(urlString, options);
	}

	if (/^view-source:/i.test(urlString)) {
		throw new Error('`view-source:` is not supported as it is a non-standard protocol');
	}

	const hasRelativeProtocol = urlString.startsWith('//');
	const isRelativeUrl = !hasRelativeProtocol && /^\.*\//.test(urlString);

	// Prepend protocol: either insert it in front of a protocol-less URL or
	// substitute it for the leading `//` of a protocol-relative one.
	if (!isRelativeUrl) {
		urlString = urlString.replace(/^(?!(?:\w+:)?\/\/)|^\/\//, options.defaultProtocol);
	}

	const urlObject = new URL(urlString);

	if (options.forceHttp && options.forceHttps) {
		throw new Error('The `forceHttp` and `forceHttps` options cannot be used together');
	}

	if (options.forceHttp && urlObject.protocol === 'https:') {
		urlObject.protocol = 'http:';
	}

	if (options.forceHttps && urlObject.protocol === 'http:') {
		urlObject.protocol = 'https:';
	}

	// Remove auth
	if (options.stripAuthentication) {
		urlObject.username = '';
		urlObject.password = '';
	}

	// Remove hash
	if (options.stripHash) {
		urlObject.hash = '';
	} else if (options.stripTextFragment) {
		urlObject.hash = urlObject.hash.replace(/#?:~:text.*?$/i, '');
	}

	// Remove duplicate slashes if not preceded by a protocol
	// NOTE: This could be implemented using a single negative lookbehind
	// regex, but we avoid that to maintain compatibility with older js engines
	// which do not have support for that feature.
	if (urlObject.pathname) {
		// TODO: Replace everything below with `urlObject.pathname = urlObject.pathname.replace(/(?<!\b[a-z][a-z\d+\-.]{1,50}:)\/{2,}/g, '/');` when Safari supports negative lookbehind.

		// Split the string by occurrences of this protocol regex, and perform
		// duplicate-slash replacement on the strings between those occurrences
		// (if any).
		const protocolRegex = /\b[a-z][a-z\d+\-.]{1,50}:\/\//g;

		let lastIndex = 0;
		let result = '';
		for (;;) {
			const match = protocolRegex.exec(urlObject.pathname);
			if (!match) {
				break;
			}

			const protocol = match[0];
			const protocolAtIndex = match.index;
			const intermediate = urlObject.pathname.slice(lastIndex, protocolAtIndex);

			result += intermediate.replace(/\/{2,}/g, '/');
			result += protocol;
			lastIndex = protocolAtIndex + protocol.length;
		}

		const remnant = urlObject.pathname.slice(lastIndex, urlObject.pathname.length);
		result += remnant.replace(/\/{2,}/g, '/');

		urlObject.pathname = result;
	}

	// Decode URI octets
	if (urlObject.pathname) {
		try {
			urlObject.pathname = decodeURI(urlObject.pathname);
		} catch {
			// Best effort: a malformed escape sequence leaves the pathname as it was.
		}
	}

	// Remove directory index (`true` is shorthand for the default index pattern)
	const directoryIndexFilters = options.removeDirectoryIndex === true
		? [/^index\.[a-z]+$/]
		: options.removeDirectoryIndex;

	if (Array.isArray(directoryIndexFilters) && directoryIndexFilters.length > 0) {
		let pathComponents = urlObject.pathname.split('/');
		const lastComponent = pathComponents[pathComponents.length - 1];

		if (testParameter(lastComponent, directoryIndexFilters)) {
			pathComponents = pathComponents.slice(0, -1);
			urlObject.pathname = pathComponents.slice(1).join('/') + '/';
		}
	}

	if (urlObject.hostname) {
		// Remove trailing dot
		urlObject.hostname = urlObject.hostname.replace(/\.$/, '');

		// Remove `www.`
		if (options.stripWWW && /^www\.(?!www\.)[a-z\-\d]{1,63}\.[a-z.\-\d]{2,63}$/.test(urlObject.hostname)) {
			// Each label should be max 63 at length (min: 1).
			// Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
			// Each TLD should be up to 63 characters long (min: 2).
			// It is technically possible to have a single character TLD, but none currently exist.
			urlObject.hostname = urlObject.hostname.replace(/^www\./, '');
		}
	}

	// Remove unwanted query parameters. The keys are copied first because
	// deleting while iterating the live `searchParams` would skip entries.
	if (Array.isArray(options.removeQueryParameters)) {
		for (const key of [...urlObject.searchParams.keys()]) {
			if (testParameter(key, options.removeQueryParameters)) {
				urlObject.searchParams.delete(key);
			}
		}
	}

	if (options.removeQueryParameters === true) {
		urlObject.search = '';
	}

	// Sort query parameters
	if (options.sortQueryParameters) {
		urlObject.searchParams.sort();
	}

	if (options.removeTrailingSlash) {
		urlObject.pathname = urlObject.pathname.replace(/\/$/, '');
	}

	const oldUrlString = urlString;

	// Take advantage of many of the Node `url` normalizations
	urlString = urlObject.toString();

	if (!options.removeSingleSlash && urlObject.pathname === '/' && !oldUrlString.endsWith('/') && urlObject.hash === '') {
		urlString = urlString.replace(/\/$/, '');
	}

	// Remove ending `/` unless removeSingleSlash is false.
	// Only do so when the serialized URL actually ends with the path: if a
	// query string is present, a trailing `/` belongs to a query value and
	// stripping it would change the URL's meaning.
	if (
		(options.removeTrailingSlash || urlObject.pathname === '/')
		&& urlObject.hash === ''
		&& urlObject.search === ''
		&& options.removeSingleSlash
	) {
		urlString = urlString.replace(/\/$/, '');
	}

	// Restore relative protocol, if applicable
	if (hasRelativeProtocol && !options.normalizeProtocol) {
		urlString = urlString.replace(/^http:\/\//, '//');
	}

	// Remove http/https
	if (options.stripProtocol) {
		urlString = urlString.replace(/^(?:https?:)?\/\//, '');
	}

	return urlString;
}