1 | 'use strict';
|
2 |
|
3 | const cheerio = require('cheerio');
|
4 | const got = require('got');
|
5 |
|
// Endpoint of the proxy service; `Scholar#init` installs it as the client's
// `prefixUrl` when an API key is supplied.
// NOTE(review): plain `http://` means the `auth_key` query parameter travels
// unencrypted — confirm whether the service offers an HTTPS endpoint.
const PROXY_URL = 'http://api.proxiesapi.com';
// Origin that search and author-profile URLs are resolved against.
const SCHOLAR_BASE_URL = 'https://scholar.google.com';
|
8 |
|
/**
 * Error raised when the proxy service rejects a request (the HTTP client in
 * this file throws it for 401 responses).
 */
class ProxyError extends Error {
  /**
   * @param {string} msg - Human-readable description of the failure.
   * @param {{ cause?: unknown }} [options] - Optional settings; `cause` records
   *   the underlying error that triggered this one.
   */
  constructor(msg, options) {
    super(msg, options);
    this.name = 'ProxyError';

    // Runtimes that predate `Error`'s options argument silently ignore it, so
    // attach the cause by hand there (non-enumerable, like the native one).
    const hasCause = typeof options === 'object' && options !== null && 'cause' in options;
    if (hasCause && !('cause' in this)) {
      Object.defineProperty(this, 'cause', {
        value: options.cause,
        writable: true,
        configurable: true,
        enumerable: false
      });
    }
  }
}
|
15 |
|
/**
 * Tell whether `err` is a got `HTTPError` whose response carries the given
 * status code.
 *
 * @param {unknown} err - Value caught from a failed request.
 * @param {{ statusCode: number }} expected - Status code to match.
 * @returns {boolean} `true` only for a matching `got.HTTPError`.
 */
const isHTTPError = (err, { statusCode }) => {
  if (!(err instanceof got.HTTPError)) {
    return false;
  }
  const { response } = err;
  return response.statusCode === statusCode;
};
|
20 |
|
/**
 * Shared got instance used for every request in this module.
 *
 * `mutableDefaults` is enabled because `Scholar#init` replaces
 * `client.defaults.options` after creation to route traffic via the proxy.
 * The handler turns a 401 response into a `ProxyError`, keeping the original
 * got error reachable through `cause`.
 */
const client = got.extend({
  mutableDefaults: true,
  handlers: [
    (options, next) => {
      // Stream requests are handed through untouched.
      if (options.isStream) return next(options);

      return (async () => {
        try {
          return await next(options);
        } catch (error) {
          if (!isHTTPError(error, { statusCode: 401 })) throw error;

          // NOTE(review): this mapping applies to every 401, including ones
          // returned when no proxy key is configured — confirm that is intended.
          const proxyError = new ProxyError('Api key invalid or expired');
          // Keep the underlying HTTP error so callers can inspect the response.
          proxyError.cause = error;
          throw proxyError;
        }
      })();
    }
  ]
});
|
35 |
|
// CSS selectors for a publication entry on a search-results page.
const pubSelectors = {
  container: '.gs_ri',
  title: '.gs_rt',
  authors: '.gs_a a'
};

// CSS selectors for the fields read from an author profile page.
const authorSelectors = {
  container: '#gsc_prf',
  name: '#gsc_prf_in',
  affiliation: '#gsc_prf_in + .gsc_prf_il',
  domain: '#gsc_prf_ivh',
  homepage: '#gsc_prf_ivh a',
  interests: '#gsc_prf_int a',
  // Cells of the second row of the stats table; the first one is read as the h-index.
  metrics: '#gsc_rsb_st tr:nth-of-type(2) .gsc_rsb_std'
};

// Selector groups keyed by the kind of page they apply to.
const selectors = { pub: pubSelectors, author: authorSelectors };
|
52 |
|
/**
 * Scraper for Google Scholar search results and author profiles.
 *
 * Requests go straight to Google Scholar unless `init(key)` was called with a
 * key, in which case they are sent through the proxy configured on the shared
 * `client`.
 */
class Scholar {
  /**
   * Optionally switch the shared client to proxy mode.
   *
   * @param {string} [key] - Proxy API key. When truthy, the client's defaults
   *   are replaced so every request targets `PROXY_URL` with `auth_key` set.
   * @returns {Scholar} This instance, for chaining.
   */
  init(key) {
    if (!key) return this;

    this.isProxy = true;
    client.defaults.options = got.mergeOptions(client.defaults.options, {
      prefixUrl: PROXY_URL,
      searchParams: { auth_key: key }
    });
    return this;
  }

  /**
   * Fetch a page, directly or via the proxy.
   *
   * @param {URL|string} url - Target address; a `URL` is reduced to its `href`.
   * @returns {Promise<object>} The client's response promise.
   */
  request(url) {
    const target = url.href || url;
    if (!this.isProxy) return client.get(target);
    // In proxy mode the real target travels as the `url` query parameter.
    return client.get({ searchParams: { url: target } });
  }

  /**
   * Search for a publication and parse the first result.
   *
   * @param {string} query - Search text, sent as the `q` parameter.
   * @returns {Promise<{title: string, authors: Array<{id: ?string, name: string, url: string}>}>}
   */
  async searchPub(query) {
    const url = new URL('scholar', SCHOLAR_BASE_URL);
    url.searchParams.set('q', query);
    const result = await this.request(url);
    return this.parsePub(result.body);
  }

  /**
   * Download and parse an author profile page.
   *
   * @param {string} link - Profile URL, absolute or relative to Scholar's origin.
   * @returns {Promise<object>} Parsed profile, see `parseAuthorProfile`.
   */
  async getAuthorProfile(link) {
    const url = new URL(link, SCHOLAR_BASE_URL);
    const result = await this.request(url);
    return this.parseAuthorProfile(result.body);
  }

  /**
   * Search for a publication and load the profile of each linked author.
   *
   * @param {string} query - Search text passed to `searchPub`.
   * @returns {Promise<Array<object>|undefined>} One `{ id, ...profile }` entry
   *   per linked author, fetched in parallel; `undefined` if no author list
   *   was produced.
   */
  async getPubAuthors(query) {
    const { authors } = await this.searchPub(query);
    if (!authors) {
      return;
    }
    return Promise.all(authors.map(async ({ id, url }) => {
      const profile = await this.getAuthorProfile(url);
      return { id, ...profile };
    }));
  }

  /**
   * Extract title and linked authors from the first publication entry.
   *
   * @param {string} html - Markup of a search-results page.
   * @returns {{title: string, authors: Array<{id: ?string, name: string, url: string}>}}
   *   `id` is the `user` query parameter of the author link.
   */
  parsePub(html) {
    const { pub } = selectors;
    const $ = cheerio.load(html);
    const $publicationContainer = $(pub.container).first();

    const authors = $publicationContainer
      .find(pub.authors)
      .map((_, el) => {
        const $el = $(el);
        const url = new URL($el.attr('href'), SCHOLAR_BASE_URL);
        return {
          id: url.searchParams.get('user'),
          name: $el.text(),
          url: url.href
        };
      })
      .get();

    const title = $publicationContainer.find(pub.title).text();
    return { title, authors };
  }

  /**
   * Extract the profile fields from an author page.
   *
   * @param {string} html - Markup of an author profile page.
   * @returns {{name: string, affiliation: string, homepage: (string|undefined),
   *   domain: (string|undefined), hindex: string, interests: string[]}}
   *   `domain` is the last token (ignoring `-`) of the text that opens the
   *   email-info element; it is `undefined` when that element or its leading
   *   text is missing.
   */
  parseAuthorProfile(html) {
    const { author } = selectors;
    const $ = cheerio.load(html);
    const $profileContainer = $(author.container);

    const name = $profileContainer.find(author.name).text();
    const affiliation = $profileContainer.find(author.affiliation).text();
    const homepage = $profileContainer.find(author.homepage).attr('href');

    // The email-info element may be absent, or start with a non-text child;
    // treat both as "no domain" instead of throwing.
    const $domain = $profileContainer.find(author.domain);
    const children = $domain.length > 0 ? $domain[0].childNodes : undefined;
    const emailNode = children ? children[0] : undefined;
    const emailInfo = emailNode && typeof emailNode.data === 'string'
      ? emailNode.data.trim()
      : '';
    const domain = emailInfo
      .split(/\s+/)
      .filter(token => token !== '' && token !== '-')
      .pop();

    const $interests = $profileContainer.find(author.interests);
    const interests = $interests.map((_, el) => $(el).text()).get();

    const hindex = $(author.metrics).first().text();

    return { name, affiliation, homepage, domain, hindex, interests };
  }
}
|
134 |
|
// Export a single shared Scholar instance, with the error class attached as a
// property so callers can recognise proxy failures.
module.exports = Object.assign(new Scholar(), { ProxyError });
|