// UNPKG file viewer header: 3.72 kB · JavaScript · View Raw
'use strict';

const cheerio = require('cheerio');
const got = require('got');

// Prefix URL installed on the shared HTTP client when an API key is passed
// to Scholar#init; requests are then routed through this service.
const PROXY_URL = 'http://api.proxiesapi.com';
// Base against which search and author-profile URLs are resolved.
const SCHOLAR_BASE_URL = 'https://scholar.google.com';
8
/**
 * Error type thrown by this module's HTTP client when a request is rejected
 * with HTTP 401 (see the handler registered on the client).
 */
class ProxyError extends Error {
  /**
   * @param {string} msg - Human-readable description of the failure.
   * @param {{ cause?: unknown }} [options] - Standard `Error` options. When
   *   the runtime supports it, `options.cause` is exposed as `error.cause`;
   *   older runtimes ignore the argument.
   */
  constructor(msg, options) {
    super(msg, options);
    this.name = 'ProxyError';
  }
}
15
/**
 * Tells whether `err` is a got `HTTPError` whose response carries the given
 * status code.
 *
 * @param {unknown} err - Value caught from a failed request.
 * @param {{ statusCode: number }} expected - Status code to match.
 * @returns {boolean} `true` only for a got `HTTPError` with that status.
 */
const isHTTPError = (err, { statusCode }) => {
  if (!(err instanceof got.HTTPError)) {
    return false;
  }
  const { response } = err;
  return response.statusCode === statusCode;
};
20
/**
 * Shared got instance used for every request made by this module.
 *
 * `mutableDefaults` is enabled because `Scholar#init` replaces
 * `client.defaults.options` after the instance has been created.
 */
const client = got.extend({
  mutableDefaults: true,
  handlers: [
    (options, next) => {
      // Stream requests are passed through untouched; only the promise
      // returned for non-stream requests gets the rejection handler below.
      if (options.isStream) return next(options);
      return next(options).catch(error => {
        if (isHTTPError(error, { statusCode: 401 })) {
          // Translate a 401 into the module's own error type, keeping the
          // original HTTP error reachable for callers that need details.
          const proxyError = new ProxyError('Api key invalid or expired');
          proxyError.cause = error;
          throw proxyError;
        }
        throw error;
      });
    }
  ]
});
35
/**
 * CSS selectors used by the parsers in this module, grouped by page type.
 * `pub` is consumed by `Scholar#parsePub`, `author` by
 * `Scholar#parseAuthorProfile`. Frozen so the shared lookup tables cannot be
 * modified at runtime.
 */
const selectors = Object.freeze({
  pub: Object.freeze({
    container: '.gs_ri',
    title: '.gs_rt',
    authors: '.gs_a a'
  }),
  author: Object.freeze({
    container: '#gsc_prf',
    name: '#gsc_prf_in',
    affiliation: '#gsc_prf_in + .gsc_prf_il',
    domain: '#gsc_prf_ivh',
    homepage: '#gsc_prf_ivh a',
    interests: '#gsc_prf_int a',
    // The first match of this selector is read as the author's h-index.
    metrics: '#gsc_rsb_st tr:nth-of-type(2) .gsc_rsb_std'
  })
});
52
53class Scholar {
54 init(key) {
55 if (key) {
56 this.isProxy = true;
57 client.defaults.options = got.mergeOptions(client.defaults.options, {
58 prefixUrl: PROXY_URL,
59 searchParams: { auth_key: key }
60 });
61 }
62 return this;
63 }
64
65 request(url) {
66 url = url.href || url;
67 const searchParams = { url };
68 return this.isProxy ? client.get({ searchParams }) : client.get(url);
69 }
70
71 async searchPub(query) {
72 const url = new URL('scholar', SCHOLAR_BASE_URL);
73 url.searchParams.set('q', query);
74 const result = await this.request(url);
75 return this.parsePub(result.body);
76 }
77
78 async getAuthorProfile(link) {
79 const url = new URL(link, SCHOLAR_BASE_URL);
80 const result = await this.request(url);
81 return this.parseAuthorProfile(result.body);
82 }
83
84 async getPubAuthors(query) {
85 const { authors } = await this.searchPub(query);
86 if (!authors) {
87 return;
88 }
89 return Promise.all(authors.map(async ({ id, url }) => {
90 const profile = await this.getAuthorProfile(url);
91 return { id, ...profile };
92 }));
93 }
94
95 parsePub(html) {
96 const { pub } = selectors;
97 const $ = cheerio.load(html);
98 const $publicationContainer = $(pub.container).first();
99
100 const $authors = $publicationContainer.find(pub.authors);
101 const authors = $authors.map((_, el) => {
102 const $el = $(el);
103 const name = $el.text();
104 const url = new URL($el.attr('href'), SCHOLAR_BASE_URL);
105 const id = url.searchParams.get('user');
106 return { id, name, url: url.href };
107 }).get();
108
109 const title = $publicationContainer.find(pub.title).text();
110 return { title, authors };
111 }
112
113 parseAuthorProfile(html) {
114 const { author } = selectors;
115 const $ = cheerio.load(html);
116 const $profileContainer = $(author.container);
117
118 const name = $profileContainer.find(author.name).text();
119 const affiliation = $profileContainer.find(author.affiliation).text();
120 const homepage = $profileContainer.find(author.homepage).attr('href');
121
122 const $domain = $profileContainer.find(author.domain);
123 const emailInfo = $domain[0].childNodes[0].data.trim();
124 const domain = emailInfo.split(/\s+/g).filter(token => token !== '-').pop();
125
126 const $interests = $profileContainer.find(author.interests);
127 const interests = $interests.map((_, el) => $(el).text()).get();
128
129 const hindex = $(author.metrics).first().text();
130
131 return { name, affiliation, homepage, domain, hindex, interests };
132 }
133}
134
// The module exports a single shared Scholar instance; the error class is
// attached to it so callers can test `instanceof scholar.ProxyError`.
const scholar = new Scholar();
scholar.ProxyError = ProxyError;
module.exports = scholar;