Skip to content

Commit 41ec4bb

Browse files
Merge pull request #254 from scholarly-python-package/develop
Develop
2 parents bc0ab17 + 952fc4f commit 41ec4bb

File tree

6 files changed

+111
-19
lines changed

6 files changed

+111
-19
lines changed

scholarly/_navigator.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from .publication_parser import _SearchScholarIterator
3030
from .author_parser import AuthorParser
3131
from .publication_parser import PublicationParser
32-
from .data_types import Author
32+
from .data_types import Author, PublicationSource
3333

3434
class DOSException(Exception):
3535
"""DOS attack was detected."""
@@ -246,10 +246,11 @@ def search_publication(self, url: str,
246246
:rtype: {Publication}
247247
"""
248248
soup = self._get_soup(url)
249-
res = PublicationParser(self, soup.find_all('div', 'gs_or')[0], 'scholar')
249+
publication_parser = PublicationParser(self)
250+
pub = publication_parser.get_publication(soup.find_all('div', 'gs_or')[0], PublicationSource.PUBLICATION_SEARCH_SNIPPET)
250251
if filled:
251-
res.fill()
252-
return res
252+
pub = publication_parser.fill(pub)
253+
return pub
253254

254255
def search_publications(self, url: str) -> _SearchScholarIterator:
255256
"""Returns a Publication Generator given a url
@@ -261,21 +262,25 @@ def search_publications(self, url: str) -> _SearchScholarIterator:
261262
"""
262263
return _SearchScholarIterator(self, url)
263264

264-
def search_author_id(self, id: str, filled: bool = False) -> Author:
265+
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0) -> Author:
265266
"""Search by author ID and return a Author object
266267
:param id: the Google Scholar id of a particular author
267268
:type id: str
268269
:param filled: If the returned Author object should be filled
269270
:type filled: bool, optional
271+
:param sortby: if the object is an author, select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
272+
:type sortby: string
273+
:param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
274+
:type publication_limit: int
270275
:returns: an Author object
271276
:rtype: {Author}
272277
"""
273278
author_parser = AuthorParser(self)
274279
res = author_parser.get_author(id)
275280
if filled:
276-
res = author_parser.fill(res)
281+
res = author_parser.fill(res, sortby=sortby, publication_limit=publication_limit)
277282
else:
278-
res = author_parser.fill(res, sections=['basics'])
283+
res = author_parser.fill(res, sections=['basics'], sortby=sortby, publication_limit=publication_limit)
279284
return res
280285

281286
def search_organization(self, url: str, fromauthor: bool) -> list:

scholarly/_scholarly.py

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55
import copy
66
import pprint
7-
from typing import Callable
7+
from typing import Callable, List
88
from ._navigator import Navigator
99
from ._proxy_generator import ProxyGenerator
1010
from dotenv import find_dotenv, load_dotenv
@@ -14,6 +14,7 @@
1414

1515
_AUTHSEARCH = '/citations?hl=en&view_op=search_authors&mauthors={0}'
1616
_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
17+
_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
1718
_PUBSEARCH = '/scholar?hl=en&q={0}'
1819

1920

@@ -181,7 +182,7 @@ def search_author(self, name: str):
181182
url = _AUTHSEARCH.format(requests.utils.quote(name))
182183
return self.__nav.search_authors(url)
183184

184-
def fill(self, object: dict, sections=[]) -> Author or Publication:
185+
def fill(self, object: dict, sections=[], sortby: str = "citedby", publication_limit: int = 0) -> Author or Publication:
185186
"""Fills the object according to its type.
186187
If the container type is Author it will fill the additional author fields
187188
If it is Publication it will fill it accordingly.
@@ -190,11 +191,15 @@ def fill(self, object: dict, sections=[]) -> Author or Publication:
190191
:type object: Author or Publication
191192
:param sections: the sections that the user wants filled for an Author object. This can be: ['basics', 'indices', 'counts', 'coauthors', 'publications']
192193
:type sections: list
194+
:param sortby: if the object is an author, select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
195+
:type sortby: string
196+
:param publication_limit: if the object is an author, select the max number of publications you want to fill for the author. Defaults to no limit.
197+
:type publication_limit: int
193198
"""
194199

195200
if object['container_type'] == "Author":
196201
author_parser = AuthorParser(self.__nav)
197-
object = author_parser.fill(object, sections)
202+
object = author_parser.fill(object, sections, sortby, publication_limit)
198203
if object is False:
199204
raise ValueError("Incorrect input")
200205
elif object['container_type'] == "Publication":
@@ -231,8 +236,12 @@ def citedby(self, object: Publication)->_SearchScholarIterator:
231236
return
232237

233238

234-
def search_author_id(self, id: str, filled: bool = False)->Author:
239+
def search_author_id(self, id: str, filled: bool = False, sortby: str = "citedby", publication_limit: int = 0)->Author:
235240
"""Search by author id and return a single Author object
241+
:param sortby: select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
242+
:type sortby: string
243+
:param publication_limit: if the object is an author, select the max number of publications you want to fill for the author. Defaults to no limit.
244+
:type publication_limit: int
236245
237246
:Example::
238247
@@ -252,7 +261,7 @@ def search_author_id(self, id: str, filled: bool = False)->Author:
252261
'scholar_id': 'EmD_lTEAAAAJ',
253262
'source': 'AUTHOR_PROFILE_PAGE'}
254263
"""
255-
return self.__nav.search_author_id(id, filled)
264+
return self.__nav.search_author_id(id, filled, sortby, publication_limit)
256265

257266
def search_keyword(self, keyword: str):
258267
"""Search by keyword and return a generator of Author objects
@@ -287,6 +296,45 @@ def search_keyword(self, keyword: str):
287296
url = _KEYWORDSEARCH.format(requests.utils.quote(keyword))
288297
return self.__nav.search_authors(url)
289298

299+
def search_keywords(self, keywords: List[str]):
300+
"""Search by keywords and return a generator of Author objects
301+
302+
:param keywords: a list of keywords to be searched
303+
:type keyword: List[str]
304+
305+
:Example::
306+
307+
.. testcode::
308+
309+
search_query = scholarly.search_keywords(['crowdsourcing', 'privacy'])
310+
scholarly.pprint(next(search_query))
311+
312+
:Output::
313+
314+
.. testoutput::
315+
{'affiliation': 'Cornell University',
316+
'citedby': 40976,
317+
'email_domain': '',
318+
'filled': False,
319+
'interests': ['Crowdsourcing',
320+
'privacy',
321+
'social computing',
322+
'game theory',
323+
'user-generated content'],
324+
'name': 'Arpita Ghosh',
325+
'scholar_id': '_cMw1IUAAAAJ',
326+
'source': 'SEARCH_AUTHOR_SNIPPETS',
327+
'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=_cMw1IUAAAAJ'}
328+
329+
"""
330+
331+
formated_keywords = ['label:'+requests.utils.quote(keyword) for keyword in keywords]
332+
formated_keywords = '+'.join(formated_keywords)
333+
url = _KEYWORDSEARCHBASE.format(formated_keywords)
334+
return self.__nav.search_authors(url)
335+
336+
337+
290338
def search_pubs_custom_url(self, url: str)->_SearchScholarIterator:
291339
"""Search by custom URL and return a generator of Publication objects
292340
URL should be of the form '/scholar?q=...'
@@ -304,7 +352,25 @@ def search_author_custom_url(self, url: str)->Author:
304352
:type url: string
305353
"""
306354
return self.__nav.search_authors(url)
307-
355+
356+
def get_related_articles(self, object: Publication)->_SearchScholarIterator:
    """
    Search google scholar for related articles to a specific publication.

    :param object: Publication object used to get the related articles
    :type object: Publication
    :returns: the result of searching the publication's
        ``url_related_articles``; ``None`` when ``object`` is not a
        Publication, when its source is neither an author publication
        entry nor a publication search snippet, or when no
        related-articles url is available (a message is printed for the
        first and last case)
    """
    if object['container_type'] != 'Publication':
        print("Not a publication object")
        return None

    source = object['source']
    if source == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
        # Author-page entries carry the related-articles url only after
        # being filled, so fetch the details on demand.
        if 'url_related_articles' not in object:
            object = self.fill(object)
    elif source != PublicationSource.PUBLICATION_SEARCH_SNIPPET:
        # Other sources are not handled here.
        return None

    # The url is only recorded when a "Related articles" link was found,
    # so it may be missing even for a supported source.
    url = object.get('url_related_articles')
    if url is None:
        print("No related articles url available for this publication")
        return None
    return self.__nav.search_publications(url)
373+
308374
def pprint(self, object: Author or Publication)->None:
309375
"""Pretty print an Author or Publication container object
310376

scholarly/author_parser.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def _fill_counts(self, soup, author):
108108
for c in soup.find_all('span', class_='gsc_g_al')]
109109
author['cites_per_year'] = dict(zip(years, cites))
110110

111-
def _fill_publications(self, soup, author):
111+
def _fill_publications(self, soup, author, publication_limit: int = 0):
112112
author['publications'] = list()
113113
pubstart = 0
114114
url_citations = _CITATIONAUTH.format(author['scholar_id'])
@@ -118,6 +118,8 @@ def _fill_publications(self, soup, author):
118118
for row in soup.find_all('tr', class_='gsc_a_tr'):
119119
new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY)
120120
author['publications'].append(new_pub)
121+
if (publication_limit) and (len(author['publications']) >= publication_limit):
122+
break
121123
if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs:
122124
pubstart += _PAGESIZE
123125
url = '{0}&cstart={1}&pagesize={2}'.format(
@@ -137,7 +139,7 @@ def _fill_coauthors(self, soup, author):
137139
new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST
138140
author['coauthors'].append(new_coauthor)
139141

140-
def fill(self, author, sections: list = []):
142+
def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
141143
"""Populate the Author with information from their profile
142144
143145
The `sections` argument allows for finer granularity of the profile
@@ -152,6 +154,10 @@ def fill(self, author, sections: list = []):
152154
* ``publications``: fills publications;
153155
* ``[]``: fills all of the above
154156
:type sections: ['basics','citations','counts','coauthors','publications',[]] list, optional
157+
:param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
158+
:type sortby: string
159+
:param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
160+
:type publication_limit: int
155161
:returns: The filled object if fill was successful, False otherwise.
156162
:rtype: Author or bool
157163
@@ -296,19 +302,25 @@ def fill(self, author, sections: list = []):
296302
"""
297303
try:
298304
sections = [section.lower() for section in sections]
305+
sortby_str = ''
306+
if sortby == "year":
307+
sortby_str = '&view_op=list_works&sortby=pubdate'
308+
elif sortby != "citedby":
309+
raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'")
299310
url_citations = _CITATIONAUTH.format(author['scholar_id'])
311+
url_citations += sortby_str
300312
url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
301313
soup = self.nav._get_soup(url)
302314

303315
if sections == []:
304316
for i in self._sections:
305317
if i not in author['filled']:
306-
getattr(self, f'_fill_{i}')(soup, author)
318+
(getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit))
307319
author['filled'].add(i)
308320
else:
309321
for i in sections:
310322
if i in self._sections and i not in author['filled']:
311-
getattr(self, f'_fill_{i}')(soup, author)
323+
(getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit))
312324
author['filled'].add(i)
313325
except Exception as e:
314326
raise(e)

scholarly/data_types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ class Publication(TypedDict, total=False):
149149
of multiple publications, and therefore may have multiple "citedby_id"
150150
values.
151151
(source: AUTHOR_PUBLICATION_ENTRY)
152+
:param url_related_articles: the url containing link for related articles of a publication (needs fill() for AUTHOR_PUBLICATION_ENTRIES)
152153
:param url_add_sclib: (source: PUBLICATION_SEARCH_SNIPPET)
153154
:param url_scholarbib: the url containing links for
154155
the BibTeX entry, EndNote, RefMan and RefWorks (source: PUBLICATION_SEARCH_SNIPPET)
@@ -169,6 +170,7 @@ class Publication(TypedDict, total=False):
169170
eprint_url: str
170171
pub_url: str
171172
url_add_sclib: str
173+
url_related_articles: str
172174
url_scholarbib: str
173175
filled: bool
174176
source: PublicationSource

scholarly/publication_parser.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,9 @@ def _scholar_pub(self, __data, publication: Publication):
234234
if 'Cited by' in link.text:
235235
publication['num_citations'] = int(re.findall(r'\d+', link.text)[0].strip())
236236
publication['citedby_url'] = link['href']
237+
238+
if 'Related articles' in link.text:
239+
publication['url_related_articles'] = link['href']
237240

238241
if __data.find('div', class_='gs_ggs gs_fl'):
239242
publication['eprint_url'] = __data.find(
@@ -257,7 +260,7 @@ def fill(self, publication: Publication)->Publication:
257260
for item in soup.find_all('div', class_='gs_scl'):
258261
key = item.find(class_='gsc_vcd_field').text.strip().lower()
259262
val = item.find(class_='gsc_vcd_value')
260-
if key == 'authors':
263+
if key == 'authors' or key == 'inventors':
261264
publication['bib']['author'] = ' and '.join(
262265
[i.strip() for i in val.text.split(',')])
263266
elif key == 'journal':
@@ -306,6 +309,10 @@ def fill(self, publication: Publication)->Publication:
306309
publication['cites_id'] = re.findall(
307310
_SCHOLARPUBRE, val.a['href'])[0]
308311
publication['citedby_url'] = _CITEDBYLINK.format(publication['cites_id'])
312+
elif key == 'scholar articles':
313+
for entry in val.find_all('a'):
314+
if entry.text.lower() == 'related articles':
315+
publication['url_related_articles'] = entry.get('href')[26:]
309316
# number of citation per year
310317
years = [int(y.text) for y in soup.find_all(class_='gsc_vcd_g_t')]
311318
cites = [int(c.text) for c in soup.find_all(class_='gsc_vcd_g_al')]

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.0.5',
8+
version='1.0.6',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca',
1111
description='Simple access to Google Scholar authors and citations',

0 commit comments

Comments
 (0)