Skip to content

Commit df60fdc

Browse files
committed
Adapted universal item detail crawler for movies
1 parent e78b303 commit df60fdc

File tree

2 files changed

+48
-15
lines changed

2 files changed

+48
-15
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ One more thing, 我就特别需要标记为“不想看”的功能。
3030
原因有很多,比如我看了某个短片介绍,我觉得这个片子不好,标记一下雷区。这个也是刚需吧?
3131
反正自己来实现好了。
3232

33+
目前本项目爬出来/处理过的数据储存在 [mewx.github.io-Generator](https://github.com/MewX/mewx.github.io-Generator/tree/master/data/doubak) 中。
3334
我个人会在另一个项目里使用 [Gatsby.js](https://www.gatsbyjs.org/) 用这些爬到的数据生成 self-hosted 豆瓣,托管在 GitHub Pages 上。
3435
当然啦, [Archive.org](http://archive.org) 会是我的另一个备份。
3536

task/collector.go

Lines changed: 47 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ func (task *Collector) Execute() error {
9494
task.crawlItemDetails(proto.Category_book, "li.subject-item > div.info > h2 > a")
9595
case proto.Category_movie.String():
9696
task.crawlMovieListDispatcher()
97-
// TODO: collect each movie details.
97+
task.crawlItemDetails(proto.Category_movie, "div.item > div.info > ul > li.title > a:nth-child(1)")
9898
case proto.Category_game.String():
9999
task.crawlGameListDispatcher()
100100
task.crawlItemDetails(proto.Category_game, "div.common-item > div.content > div.title > a:nth-child(1)")
@@ -452,7 +452,7 @@ func (task *Collector) crawlItemLists(cat proto.Category, totalItems int, pageSt
452452
}
453453

454454
func (task *Collector) crawlItemDetails(cat proto.Category, selector string) error {
455-
q := util.NewQueue()
455+
var urls []string
456456
inputFileNamePattern := fmt.Sprintf("*_%s_*.html", cat)
457457
files := util.GetFilePathListWithPattern(task.outputDir, inputFileNamePattern)
458458
for _, fn := range files {
@@ -461,29 +461,40 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
461461
log.Println("Error reading", fn, "with message", err)
462462
}
463463

464-
// TODO: handle subject-item missing div.info issue.
465464
doc.Find(selector).Each(func(_ int, sel *goquery.Selection) {
466465
url, exists := sel.Attr("href")
467466
if !exists {
468467
log.Fatal("Found item without link", sel.Text())
469468
}
470-
471-
// TODO: handle incremental option to check local file exists.
472-
q.AddURL(url)
469+
urls = append(urls, url)
473470
})
474471
}
475472

476-
size, _ := q.Size()
477-
log.Println("Detail queue size is:", size)
473+
// Hack around to continue progress. Set to the last downloaded progress count (1-based, 0 by default).
474+
// This hack will continue with the next URL in the queue.
475+
const iResume = 0
476+
q := util.NewQueue()
477+
for i := iResume; i < len(urls); i++ {
478+
// TODO: handle incremental option to check local file exists. Also need to update the progress counter.
479+
//re := regexp.MustCompile("[0-9]+")
480+
//id, _ := strconv.Atoi(re.FindString(urls[i]))
481+
//fileNamePattern := fmt.Sprintf("*_%s_%d.html", cat, id)
482+
//fs := util.GetFilePathListWithPattern(filepath.Join(task.outputDir, util.ItemPathPrefix), fileNamePattern)
483+
//if len(fs) == 0 {
484+
// log.Println("Found missing URL:", urls[i])
485+
// q.AddURL(urls[i])
486+
//}
487+
488+
q.AddURL(urls[i])
489+
}
490+
491+
qSize, _ := q.Size()
492+
log.Println("Detail queue size is:", qSize, "(out of", len(urls), "discovered URLs)")
478493

479-
count := 1
494+
// Reset counter (1-based).
495+
count := iResume + 1
480496
c := util.NewColly()
481497
c.OnResponse(func(r *colly.Response) {
482-
// Hack around to continue progress.
483-
if count < 0 {
484-
return
485-
}
486-
487498
// Extract ID from response (using the first occurrence of number string).
488499
re := regexp.MustCompile("[0-9]+")
489500
id, _ := strconv.Atoi(re.FindString(r.Request.URL.String()))
@@ -496,12 +507,33 @@ func (task *Collector) crawlItemDetails(cat proto.Category, selector string) err
496507
body := string(r.Body)
497508
util.FailIfNeedLogin(&body)
498509

499-
log.Println("Progress", count, "/", size)
510+
log.Println("Progress", count, "/", len(urls))
500511
count++
501512

502513
// TODO: replace this with a proper rate limiter.
503514
time.Sleep(util.RequestInterval)
504515
})
516+
c.OnError(func(r *colly.Response, err error) {
517+
t := string(r.Body)
518+
if strings.Contains(t, "页面不存在") {
519+
// Deleted page... Thank you GFW!
520+
// Example page: https://github.com/MewX/mewx.github.io-Generator/blob/master/data/doubak/collector/items/20221227.1551_movie_26575153.html
521+
log.Println("Page deleted by Douban:", r.Request.URL)
522+
523+
// I still want to save a copy of this page since it's managed by version control.
524+
// TODO: extract this code into a common library.
525+
// Extract ID from response (using the first occurrence of number string).
526+
re := regexp.MustCompile("[0-9]+")
527+
id, _ := strconv.Atoi(re.FindString(r.Request.URL.String()))
528+
529+
fileName := fmt.Sprintf("%s_%s_%d.html", timePrefix, cat, id)
530+
if err := task.saveResponse(r, util.ItemPathPrefix+fileName); err != nil {
531+
log.Println(err.Error())
532+
}
533+
}
534+
// Still need to advance the progress counter so it stays deterministic.
535+
count++
536+
})
505537
return q.Run(c)
506538
}
507539

0 commit comments

Comments
 (0)