Skip to content

Commit 720a522

Browse files
authored
Merge pull request #341 from scholarly-python-package/develop
Releasing 1.4.2
2 parents b0aa82a + e3ba7e5 commit 720a522

File tree

8 files changed

+227
-101
lines changed

8 files changed

+227
-101
lines changed

.github/workflows/pythonpackage.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,6 @@ jobs:
4747
PASSWORD: ${{ secrets.PASSWORD }}
4848
PORT: ${{ secrets.PORT }}
4949
USERNAME: ${{ secrets.USERNAME }}
50+
SCRAPER_API_KEY: ${{ secrets.SCRAPER_API_KEY }}
5051
run: |
5152
python3 -m unittest -v test_module.py

docs/quickstart.rst

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ Search by keyword and return a generator of Author objects.
123123
'source': 'SEARCH_AUTHOR_SNIPPETS',
124124
'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=lHrs3Y4AAAAJ'}
125125
126-
``search_pubs``
126+
``search_pubs``
127127
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
128128
Search for articles/publications and return generator of Publication objects.
129129
#############################################################################
@@ -363,7 +363,8 @@ Using proxies
363363
-------------
364364

365365
In general, Google Scholar does not like bots, and can often block
366-
scholarly. We are actively working towards making scholarly more robust
366+
scholarly, especially those pages that contain ``scholar?`` in the URL.
367+
We are actively working towards making scholarly more robust
367368
towards that front.
368369

369370
The most common solution for avoiding network issues is to use proxies
@@ -390,12 +391,18 @@ come from the ProxyGenerator class:
390391
- Tor\_Internal()
391392
- Tor\_External()
392393
- Luminati()
394+
- ScraperAPI()
393395
- FreeProxies()
394-
- SingleProxy() Example:
396+
- SingleProxy()
397+
398+
All of these methods return ``True`` if the proxy was set up successfully, which
399+
you can check before beginning to use it with the ``use_proxy`` method.
400+
401+
Example:
395402

396403
.. code:: python
397404
398-
pg.SingleProxy(http = <your http proxy>, https = <your https proxy>)
405+
success = pg.SingleProxy(http = <your http proxy>, https = <your https proxy>)
399406
400407
Finally set scholarly to use this proxy for your actions
401408

@@ -438,7 +445,7 @@ default password, but you may want to change it for your installation.)
438445
from scholarly import scholarly, ProxyGenerator
439446
440447
pg = ProxyGenerator()
441-
pg.Tor_External(tor_sock_port=9050, tor_control_port=9051, tor_password="scholarly_password")
448+
success = pg.Tor_External(tor_sock_port=9050, tor_control_port=9051, tor_password="scholarly_password")
442449
scholarly.use_proxy(pg)
443450
444451
author = next(scholarly.search_author('Steven A Cholewiak'))
@@ -458,26 +465,7 @@ executable in your system.
458465
from scholarly import scholarly, ProxyGenerator
459466
460467
pg = ProxyGenerator()
461-
pg.Tor_Internal(tor_cmd = "tor")
462-
scholarly.use_proxy(pg)
463-
464-
author = next(scholarly.search_author('Steven A Cholewiak'))
465-
scholarly.pprint(author)
466-
467-
``FreeProxies``
468-
^^^^^^^^^^^^^^^^^^^^
469-
pg.FreeProxies()
470-
################
471-
472-
This uses the ``free-proxy`` pip library to add a proxy to your
473-
configuration.
474-
475-
.. code:: python
476-
477-
from scholarly import scholarly, ProxyGenerator
478-
479-
pg = ProxyGenerator()
480-
pg.FreeProxies()
468+
success = pg.Tor_Internal(tor_cmd = "tor")
481469
scholarly.use_proxy(pg)
482470
483471
author = next(scholarly.search_author('Steven A Cholewiak'))
@@ -502,7 +490,7 @@ You can use your own configuration
502490

503491
.. code:: python
504492
505-
pg.Luminati(usr= "your_username",passwd ="your_password", port = "your_port" )
493+
success = pg.Luminati(usr= "your_username",passwd ="your_password", port = "your_port" )
506494
507495
Or alternatively you can use the environment variables set in your .env
508496
file
@@ -519,6 +507,61 @@ file
519507
author = next(scholarly.search_author('Steven A Cholewiak'))
520508
scholarly.pprint(author)
521509
510+
``ScraperAPI``
511+
^^^^^^^^^^^^^^
512+
pg.ScraperAPI()
513+
###############
514+
515+
.. code:: python
516+
517+
from scholarly import scholarly, ProxyGenerator
518+
519+
pg = ProxyGenerator()
520+
521+
You will have to provide your ScraperAPI key
522+
523+
.. code:: python
524+
525+
success = pg.ScraperAPI(YOUR_SCRAPER_API_KEY)
526+
527+
Or alternatively you can use the environment variables as in the case of the Luminati example.
528+
529+
If you have Startup or higher paid plans, you can use additional options that are allowed for your plan.
530+
531+
.. code:: python
532+
533+
success = pg.ScraperAPI(YOUR_SCRAPER_API_KEY, country_code='fr', premium=True, render=True)
534+
535+
See https://www.scraperapi.com/pricing/ to see which options are enabled for your plan.
536+
537+
Finally, you can route your query through the ScraperAPI proxy
538+
539+
.. code:: python
540+
541+
scholarly.use_proxy(pg)
542+
543+
author = next(scholarly.search_author('Steven A Cholewiak'))
544+
scholarly.pprint(author)
545+
546+
``FreeProxies``
547+
^^^^^^^^^^^^^^^^^^^^
548+
pg.FreeProxies()
549+
################
550+
551+
This uses the ``free-proxy`` pip library to add a proxy to your
552+
configuration.
553+
554+
.. code:: python
555+
556+
from scholarly import scholarly, ProxyGenerator
557+
558+
pg = ProxyGenerator()
559+
success = pg.FreeProxies()
560+
scholarly.use_proxy(pg)
561+
562+
author = next(scholarly.search_author('Steven A Cholewiak'))
563+
scholarly.pprint(author)
564+
522565
``SingleProxy``
523566
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
524567
pg.SingleProxy(http: str, https:str)
@@ -531,7 +574,7 @@ If you want to use a proxy of your choice, feel free to use this option.
531574
from scholarly import scholarly, ProxyGenerator
532575
533576
pg = ProxyGenerator()
534-
pg.SingleProxy(http = <your http proxy>, https = <your https proxy>)
577+
success = pg.SingleProxy(http = <your http proxy>, https = <your https proxy>)
535578
scholarly.use_proxy(pg)
536579
537580
author = next(scholarly.search_author('Steven A Cholewiak'))
@@ -556,7 +599,8 @@ the working directory of the ``test_module.py`` as:
556599
557600
Define the connection method for the Tests, among these options:
558601

559-
- luminati (if you have a luminati proxy service)
602+
- luminati (if you have a Luminati proxy service)
603+
- scraperapi (if you have a ScraperAPI proxy service)
560604
- freeproxy
561605
- tor
562606
- tor\_internal

scholarly/_navigator.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class DOSException(Exception):
3535
"""DOS attack was detected."""
3636

3737
class MaxTriesExceededException(Exception):
38-
pass
38+
"""Maximum number of tries by scholarly reached"""
3939

4040
class Singleton(type):
4141
_instances = {}
@@ -82,27 +82,28 @@ def _new_session(self):
8282
self.got_403 = False
8383
self._session = self.pm._new_session()
8484

85-
85+
8686
def _get_page(self, pagerequest: str) -> str:
8787
"""Return the data from a webpage
8888
8989
:param pagerequest: the page url
9090
:type pagerequest: str
9191
:returns: the text from a webpage
9292
:rtype: {str}
93-
:raises: Exception
93+
:raises: MaxTriesExceededException, DOSException
9494
"""
9595
self.logger.info("Getting %s", pagerequest)
9696
resp = None
9797
tries = 0
98+
if self.pm._use_scraperapi:
99+
self.set_timeout(60)
98100
timeout=self._TIMEOUT
99101
while tries < self._max_retries:
100102
try:
101103
w = random.uniform(1,2)
102104
time.sleep(w)
103-
104105
resp = self._session.get(pagerequest, timeout=timeout)
105-
self.logger.info("Session proxy config is {}".format(self._session.proxies))
106+
self.logger.debug("Session proxy config is {}".format(self._session.proxies))
106107

107108
has_captcha = self._requests_has_captcha(resp.text)
108109

@@ -125,7 +126,7 @@ def _get_page(self, pagerequest: str) -> str:
125126
time.sleep(w)
126127
self._new_session()
127128
self.got_403 = True
128-
129+
129130
continue # Retry request within same session
130131
else:
131132
self.logger.info("We can use another connection... let's try that.")
@@ -215,7 +216,7 @@ def _get_soup(self, url: str) -> BeautifulSoup:
215216
def search_authors(self, url: str)->Author:
216217
"""Generator that returns Author objects from the author search page"""
217218
soup = self._get_soup(url)
218-
219+
219220
author_parser = AuthorParser(self)
220221
while True:
221222
rows = soup.find_all('div', 'gsc_1usr')

0 commit comments

Comments
 (0)