Skip to content

Commit d6c95a7

Browse files
Merge pull request #240 from scholarly-python-package/develop
Develop
2 parents 226e772 + ba10e0e commit d6c95a7

File tree

3 files changed

+64
-4
lines changed

3 files changed

+64
-4
lines changed

scholarly/_navigator.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
class DOSException(Exception):
3535
"""DOS attack was detected."""
3636

37+
class MaxTriesExceededException(Exception):
    """Maximum number of tries to fetch a page from Google Scholar was exceeded."""
3739

3840
class Singleton(type):
3941
_instances = {}
@@ -65,6 +67,10 @@ def set_logger(self, enable: bool):
6567

6668
self.logger.setLevel((logging.INFO if enable else logging.CRITICAL))
6769

70+
def set_timeout(self, timeout: int):
    """Set timeout period in seconds for scholarly.

    A negative value is rejected: the current timeout is kept and the
    rejection is logged instead of being dropped silently.

    :param timeout: timeout period in seconds, must be non-negative
    :type timeout: int
    """
    if timeout >= 0:
        self._TIMEOUT = timeout
    else:
        self.logger.warning("Ignoring negative timeout: %s", timeout)
6874

6975
def use_proxy(self, pg: ProxyGenerator):
7076
if pg is not None:
@@ -149,7 +155,7 @@ def _get_page(self, pagerequest: str) -> str:
149155

150156
tries += 1
151157
self._session, timeout = self.pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
152-
raise Exception("Cannot fetch the page from Google Scholar.")
158+
raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")
153159

154160

155161
def _set_retries(self, num_retries: int) -> None:
@@ -271,3 +277,28 @@ def search_author_id(self, id: str, filled: bool = False) -> Author:
271277
else:
272278
res = author_parser.fill(res, sections=['basics'])
273279
return res
280+
281+
def search_organization(self, url: str, fromauthor: bool) -> list:
    """Generate institution objects from an author search page.

    If no results are found and `fromauthor` is True, then use the first
    author from the search to get the institution/organization name.

    :param url: URL of the author search page to fetch
    :type url: str
    :param fromauthor: when no institution is listed on the page, fall back
                       to the organization of the first author found
    :type fromauthor: bool
    :returns: list of dicts with 'Organization' and 'id' keys; an entry
              obtained through the author fallback is additionally marked
              with ``'fromauthor': True``. Empty when nothing is found.
    :rtype: list
    """
    soup = self._get_soup(url)
    rows = soup.find_all('h3', 'gsc_inst_res')
    if rows:
        self.logger.info("Found institution")

    # The organization id is whatever follows 'org=' in the result link.
    res = [
        {'Organization': row.a.text, 'id': row.a['href'].split('org=', 1)[1]}
        for row in rows
    ]

    if not rows and fromauthor:
        try:
            auth = next(self.search_authors(url))
            authorg = self.search_author_id(auth.id).organization
            authorg['fromauthor'] = True
            res.append(authorg)
        except Exception as err:
            # Best-effort fallback: any failure (including an empty author
            # search) yields no result, but is logged rather than hidden.
            self.logger.info("Could not get organization from first author: %s", err)
            res = []

    return res

scholarly/_scholarly.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,15 @@ def set_logger(self, enable: bool):
5151
"""
5252
self.__nav.set_logger(enable)
5353

54+
def set_timeout(self, timeout: int):
    """Set timeout period in seconds for scholarly.

    :param timeout: timeout period in seconds, passed on unchanged to the
                    navigator's ``set_timeout``
    :type timeout: int
    """
    self.__nav.set_timeout(timeout)
57+
5458

5559
def search_pubs(self,
5660
query: str, patents: bool = True,
5761
citations: bool = True, year_low: int = None,
58-
year_high: int = None)->_SearchScholarIterator:
62+
year_high: int = None, sortby_date: str = None)->_SearchScholarIterator:
5963
"""Searches by query and returns a generator of Publication objects
6064
6165
:param query: terms to be searched
@@ -68,6 +72,8 @@ def search_pubs(self,
6872
:type year_low: int, optional
6973
:param year_high: maximum year of publication, defaults to None
7074
:type year_high: int, optional
75+
:param sortby_date: sort by date; 'abstract' for abstracts, 'everything' for all results, defaults to None
76+
:type sortby_date: string, optional
7177
:returns: Generator of Publication objects
7278
:rtype: Iterator[:class:`Publication`]
7379
@@ -116,8 +122,14 @@ def search_pubs(self,
116122
yr_hi = '&as_yhi={0}'.format(year_high) if year_high is not None else ''
117123
citations = '&as_vis={0}'.format(1 - int(citations))
118124
patents = '&as_sdt={0},33'.format(1 - int(patents))
125+
sortby = ''
126+
127+
if sortby_date == 'abstract':
128+
sortby = '&scisbd=1'
129+
elif sortby_date == 'everything':
130+
sortby = '&scisbd=2'
119131
# improve str below
120-
url = url + yr_lo + yr_hi + citations + patents
132+
url = url + yr_lo + yr_hi + citations + patents + sortby
121133
return self.__nav.search_publications(url)
122134

123135
def search_single_pub(self, pub_title: str, filled: bool = False)->PublicationParser:
@@ -317,3 +329,20 @@ def pprint(self, object: Author or Publication)->None:
317329
del to_print['container_type']
318330
print(pprint.pformat(to_print))
319331

332+
def search_org(self, name: str, fromauthor: bool = False) -> list:
    """Look up an organization by name and list its possible disambiguations

    :param name: organization name to search for
    :type name: str
    :param fromauthor: passed through to the navigator's
                       ``search_organization``, defaults to False
    :type fromauthor: bool, optional
    :returns: the list returned by the navigator's ``search_organization``
    :rtype: list

    :Example::

        .. testcode::

            search_query = scholarly.search_org('ucla')
            print(search_query)

    :Output::

        .. testoutput::

            [{'Organization': 'University of California, Los Angeles',
              'id': '14108176128635076915'},
             {'Organization': 'Universidad Centroccidental Lisandro Alvarado',
              'id': '9670678584336165373'}
            ]
    """
    # URL-quote the name before substituting it into the author-search URL.
    quoted_name = requests.utils.quote(name)
    org_search_url = _AUTHSEARCH.format(quoted_name)
    return self.__nav.search_organization(org_search_url, fromauthor)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.0.3',
8+
version='1.0.4',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca',
1111
description='Simple access to Google Scholar authors and citations',

0 commit comments

Comments
 (0)