Skip to content

增加 sitemap.xml(网站地图)解析功能,其读取方式与 robots.txt 基本相同  #143

@sairson

Description

@sairson

增加两个结构体用于sitemap.xml内容解析

// Sitemap is the decode target for a sitemap.xml document.
// It declares no XMLName field, so the struct itself places no constraint
// on the name of the root element being decoded.
type Sitemap struct {
	// URLs holds one entry per <url> child element.
	URLs    []LocUrl `xml:"url"`
	// Sitemap holds one entry per <sitemap> child element.
	Sitemap []LocUrl `xml:"sitemap"`
}

// LocUrl is a single sitemap entry; only its <loc> child element is captured.
type LocUrl struct {
	// Loc is the character data of the entry's <loc> element, stored as-is
	// (no trimming or validation happens at decode time).
	Loc string `xml:"loc"`
}

之后,在获取到响应包的 body 后,按如下方式解析:

// Decode the fetched sitemap.xml body and turn every <loc> entry into a GET
// request tagged enums.FromSitemap. A decode failure aborts with a wrapped
// error; an individual entry that urllib.GetURL rejects is skipped.
sitemap := Sitemap{}
	if err := xml.NewDecoder(strings.NewReader(resp.ToText())).Decode(&sitemap); err != nil {
		return result, errors.Wrap(err, "could not decode xml")
	}

	// Compile the pattern once instead of once per entry. It keeps everything
	// from the first "/" to the end of that line of the <loc> text.
	locPattern := regexp.MustCompile(`(/.+)`)

	// <url> entries are handled first, then <sitemap> entries, so requests are
	// emitted in the same order as when the two groups had separate loops.
	for _, entries := range [][]LocUrl{sitemap.URLs, sitemap.Sitemap} {
		for _, entry := range entries {
			// TrimSpace (rather than trimming only " \t\n") also strips "\r",
			// so sitemaps with CRLF line endings do not leak a trailing
			// carriage return into the extracted location.
			loc := locPattern.FindString(strings.TrimSpace(entry.Loc))

			// NOTE(review): presumably resolves loc against the page URL —
			// confirm against urllib.GetURL.
			url, err := urllib.GetURL(loc, *navRequest.URL)
			if err != nil {
				continue
			}

			request := parse.GetRequest(enums.GET, url)
			request.Source = enums.FromSitemap
			// The callback's error is deliberately ignored: one failing entry
			// must not stop the remaining entries from being processed.
			_ = callback(request)
			result = append(result, request)
		}
	}
	return result, nil

Metadata

Metadata

Assignees

No one assigned

    Labels

    feature: New feature or request

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions