@@ -94,7 +94,7 @@ func (task *Collector) Execute() error {
9494 task .crawlItemDetails (proto .Category_book , "li.subject-item > div.info > h2 > a" )
9595 case proto .Category_movie .String ():
9696 task .crawlMovieListDispatcher ()
97- // TODO: collect each movie details.
97+ task . crawlItemDetails ( proto . Category_movie , "div.item > div.info > ul > li.title > a:nth-child(1)" )
9898 case proto .Category_game .String ():
9999 task .crawlGameListDispatcher ()
100100 task .crawlItemDetails (proto .Category_game , "div.common-item > div.content > div.title > a:nth-child(1)" )
@@ -452,7 +452,7 @@ func (task *Collector) crawlItemLists(cat proto.Category, totalItems int, pageSt
452452}
453453
454454func (task * Collector ) crawlItemDetails (cat proto.Category , selector string ) error {
455- q := util . NewQueue ()
455+ var urls [] string
456456 inputFileNamePattern := fmt .Sprintf ("*_%s_*.html" , cat )
457457 files := util .GetFilePathListWithPattern (task .outputDir , inputFileNamePattern )
458458 for _ , fn := range files {
@@ -461,29 +461,40 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
461461 log .Println ("Error reading" , fn , "with message" , err )
462462 }
463463
464- // TODO: handle subject-item missing div.info issue.
465464 doc .Find (selector ).Each (func (_ int , sel * goquery.Selection ) {
466465 url , exists := sel .Attr ("href" )
467466 if ! exists {
468467 log .Fatal ("Found item without link" , sel .Text ())
469468 }
470-
471- // TODO: handle incremental option to check local file exists.
472- q .AddURL (url )
469+ urls = append (urls , url )
473470 })
474471 }
475472
476- size , _ := q .Size ()
477- log .Println ("Detail queue size is:" , size )
473+ // Workaround for resuming an interrupted run: set iResume to the last progress count that was downloaded (1-based; 0 means start from the beginning).
474+ // Crawling then continues with the next URL in the queue.
475+ const iResume = 0
476+ q := util .NewQueue ()
477+ for i := iResume ; i < len (urls ); i ++ {
478+ // TODO: handle incremental option to check local file exists. Also need to update the progress counter.
479+ //re := regexp.MustCompile("[0-9]+")
480+ //id, _ := strconv.Atoi(re.FindString(urls[i]))
481+ //fileNamePattern := fmt.Sprintf("*_%s_%d.html", cat, id)
482+ //fs := util.GetFilePathListWithPattern(filepath.Join(task.outputDir, util.ItemPathPrefix), fileNamePattern)
483+ //if len(fs) == 0 {
484+ // log.Println("Found missing URL:", urls[i])
485+ // q.AddURL(urls[i])
486+ //}
487+
488+ q .AddURL (urls [i ])
489+ }
490+
491+ qSize , _ := q .Size ()
492+ log .Println ("Detail queue size is:" , qSize , "(out of" , len (urls ), "discovered URLs)" )
478493
479- count := 1
494+ // Initialize the progress counter (1-based) so it continues from the resume point.
495+ count := iResume + 1
480496 c := util .NewColly ()
481497 c .OnResponse (func (r * colly.Response ) {
482- // Hack around to continue progress.
483- if count < 0 {
484- return
485- }
486-
487498 // Extract ID from response (using the first occurrence of number string).
488499 re := regexp .MustCompile ("[0-9]+" )
489500 id , _ := strconv .Atoi (re .FindString (r .Request .URL .String ()))
@@ -496,12 +507,33 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
496507 body := string (r .Body )
497508 util .FailIfNeedLogin (& body )
498509
499- log .Println ("Progress" , count , "/" , size )
510+ log .Println ("Progress" , count , "/" , len ( urls ) )
500511 count ++
501512
502513 // TODO: replace this with a proper rate limiter.
503514 time .Sleep (util .RequestInterval )
504515 })
516+ c .OnError (func (r * colly.Response , err error ) {
517+ t := string (r .Body )
518+ if strings .Contains (t , "页面不存在" ) {
519+ // Deleted page... Thank you GFW!
520+ // Example page: https://github.com/MewX/mewx.github.io-Generator/blob/master/data/doubak/collector/items/20221227.1551_movie_26575153.html
521+ log .Println ("Page deleted by Douban:" , r .Request .URL )
522+
523+ // I still want to save a copy of this page since it's managed by version control.
524+ // TODO: extract this code into a common library.
525+ // Extract ID from response (using the first occurrence of number string).
526+ re := regexp .MustCompile ("[0-9]+" )
527+ id , _ := strconv .Atoi (re .FindString (r .Request .URL .String ()))
528+
529+ fileName := fmt .Sprintf ("%s_%s_%d.html" , timePrefix , cat , id )
530+ if err := task .saveResponse (r , util .ItemPathPrefix + fileName ); err != nil {
531+ log .Println (err .Error ())
532+ }
533+ }
534+ // Still need to update the deterministic counter.
535+ count ++
536+ })
505537 return q .Run (c )
506538}
507539
0 commit comments