Skip to content

Commit 7fe6ba6

Browse files
authored
Merge pull request #319 from scholarly-python-package/develop
Releasing 1.3.0
2 parents 890a19a + 828fdee commit 7fe6ba6

File tree

4 files changed

+66
-30
lines changed

4 files changed

+66
-30
lines changed

scholarly/_scholarly.py

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
_KEYWORDSEARCH = '/citations?hl=en&view_op=search_authors&mauthors=label:{0}'
1717
_KEYWORDSEARCHBASE = '/citations?hl=en&view_op=search_authors&mauthors={}'
1818
_PUBSEARCH = '/scholar?hl=en&q={0}'
19+
_CITEDBYSEARCH = '/scholar?hl=en&cites={0}'
1920

2021

2122
class _Scholarly:
@@ -123,29 +124,18 @@ def search_pubs(self,
123124
'url_scholarbib': '/scholar?q=info:K8ZpoI6hZNoJ:scholar.google.com/&output=cite&scirp=0&hl=en'}
124125
125126
"""
126-
url = _PUBSEARCH.format(requests.utils.quote(query))
127-
128-
yr_lo = '&as_ylo={0}'.format(year_low) if year_low is not None else ''
129-
yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else ''
130-
citations = '&as_vis={0}'.format(1 - int(citations))
131-
patents = '&as_sdt={0},33'.format(1 - int(patents))
132-
sortby = ''
133-
start = '&start={0}'.format(start_index) if start_index > 0 else ''
134-
135-
if sort_by == "date":
136-
if include_last_year == "abstracts":
137-
sortby = '&scisbd=1'
138-
elif include_last_year == "everything":
139-
sortby = '&scisbd=2'
140-
else:
141-
print("Invalid option for 'include_last_year', available options: 'everything', 'abstracts'")
142-
return
143-
elif sort_by != "relevance":
144-
print("Invalid option for 'sort_by', available options: 'relevance', 'date'")
145-
return
146-
147-
# improve str below
148-
url = url + yr_lo + yr_hi + citations + patents + sortby + start
127+
url = _construct_url(_PUBSEARCH.format(requests.utils.quote(query)), patents=patents, citations=citations, year_low=year_low, year_high=year_high)
128+
return self.__nav.search_publications(url)
129+
130+
def search_citedby(self, publication_id: int, **kwargs):
131+
"""Searches by Google Scholar publication id and returns a generator of Publication objects.
132+
133+
:param publication_id: Google Scholar id of the publication whose citing publications are searched
134+
:type publication_id: int
135+
136+
For the remaining parameters, see documentation of `search_pubs`.
137+
"""
138+
url = _construct_url(_CITEDBYSEARCH.format(str(publication_id)), **kwargs)
149139
return self.__nav.search_publications(url)
150140

151141
def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationParser:
@@ -430,3 +420,33 @@ def search_org(self, name: str, fromauthor: bool = False) -> list:
430420

431421
url = _AUTHSEARCH.format(requests.utils.quote(name))
432422
return self.__nav.search_organization(url, fromauthor)
423+
424+
def _construct_url(baseurl: str, patents: bool = True,
425+
citations: bool = True, year_low: int = None,
426+
year_high: int = None, sort_by: str = "relevance",
427+
include_last_year: str = "abstracts",
428+
start_index: int = 0)-> str:
429+
"""Construct URL from requested parameters."""
430+
url = baseurl
431+
432+
yr_lo = '&as_ylo={0}'.format(year_low) if year_low is not None else ''
433+
yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else ''
434+
citations = '&as_vis={0}'.format(1 - int(citations))
435+
patents = '&as_sdt={0},33'.format(1 - int(patents))
436+
sortby = ''
437+
start = '&start={0}'.format(start_index) if start_index > 0 else ''
438+
439+
if sort_by == "date":
440+
if include_last_year == "abstracts":
441+
sortby = '&scisbd=1'
442+
elif include_last_year == "everything":
443+
sortby = '&scisbd=2'
444+
else:
445+
print("Invalid option for 'include_last_year', available options: 'everything', 'abstracts'")
446+
return
447+
elif sort_by != "relevance":
448+
print("Invalid option for 'sort_by', available options: 'relevance', 'date'")
449+
return
450+
451+
# improve str below
452+
return url + yr_lo + yr_hi + citations + patents + sortby + start

scholarly/publication_parser.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,11 @@ def _load_url(self, url: str):
6262

6363
def _get_total_results(self):
6464
for x in self._soup.find_all('div', class_='gs_ab_mdw'):
65-
# Decimal separator is set by Google independent of language setting
66-
match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.]+)', string=x.text)
65+
# Accounting for different thousands separators:
66+
# comma, dot, whitespace, right single quotation mark (’)
67+
match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.\s’]+)', string=x.text)
6768
if match:
68-
return int(re.sub(pattern=r'[,\.]',repl='', string=match.group(2)))
69+
return int(re.sub(pattern=r'[,\.\s’]',repl='', string=match.group(2)))
6970
return None
7071

7172
# Iterator protocol

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.2.2',
8+
version='1.3.0',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca',
1111
description='Simple access to Google Scholar authors and citations',

test_module.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,19 @@ def test_search_pubs_citedby(self):
117117
cites = [c for c in scholarly.citedby(filled)]
118118
self.assertEqual(len(cites), filled['num_citations'])
119119

120+
def test_search_pubs_citedby_id(self):
121+
"""
122+
Test querying for citations by paper ID.
123+
124+
The 'Machine-learned epidemiology' paper had 11 citations as of
125+
June 1, 2020.
126+
"""
127+
# Machine-learned epidemiology: real-time detection of foodborne illness at scale
128+
publication_id = 2244396665447968936
129+
130+
pubs = [p for p in scholarly.search_citedby(publication_id)]
131+
self.assertGreaterEqual(len(pubs), 11)
132+
120133
def test_search_keyword(self):
121134
"""
122135
When we search for the keyword "3d_shape" the author
@@ -186,13 +199,17 @@ def test_search_pubs(self):
186199
def test_search_pubs_total_results(self):
187200
"""
188201
As of February 4, 2021 there are 32 pubs that fit the search term:
189-
["naive physics" stability "3d shape"].
202+
["naive physics" stability "3d shape"], and 17'000 results that fit
203+
the search term ["WIEN2k Blaha"].
190204
191205
Check that the total results for those search terms are at least 32 and at least 10000, respectively.
192206
"""
193207
pubs = scholarly.search_pubs('"naive physics" stability "3d shape"')
194208
self.assertGreaterEqual(pubs.total_results, 32)
195209

210+
pubs = scholarly.search_pubs('WIEN2k Blaha')
211+
self.assertGreaterEqual(pubs.total_results, 10000)
212+
196213
def test_search_pubs_filling_publication_contents(self):
197214
'''
198215
This process checks the process of filling a publication that is derived
@@ -232,7 +249,5 @@ def test_extract_author_id_list(self):
232249
self.assertTrue(author_id_list[3] == 'TEndP-sAAAAJ')
233250

234251

235-
236-
237252
if __name__ == '__main__':
238253
unittest.main()

0 commit comments

Comments
 (0)