Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions example/webarchive_example/strategies.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from datetime import datetime
from typing import List, Optional

from scrapy_webarchive.models import FileInfo
from scrapy_webarchive.strategies import StrategyRegistry

Expand Down
6 changes: 3 additions & 3 deletions scrapy_webarchive/cdxj/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import re
from dataclasses import dataclass, field

from typing_extensions import TYPE_CHECKING
from typing_extensions import TYPE_CHECKING, Union

if TYPE_CHECKING:
from scrapy_webarchive.wacz.wacz_file import WaczFile
Expand Down Expand Up @@ -41,13 +41,13 @@ def _parse(line: str):
return CDXREC.match(line)

@classmethod
def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile"):
def from_cdxline(cls, cdxline: str, wacz_file: "WaczFile") -> Union[CdxjRecord, None]:
"""Creates a CdxjRecord instance from a CDX(J) line."""

m = cls._parse(cdxline.strip())

if not m:
raise ValueError(f"Invalid CDXJ line: '{cdxline.strip()}'")
return None

parsed_data = m.groupdict(default="")
parsed_data['data'] = json.loads(parsed_data['data'])
Expand Down
5 changes: 3 additions & 2 deletions scrapy_webarchive/wacz/wacz_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ def _parse_index(self, index_file: Union[gzip.GzipFile, IO]) -> Dict[str, List[C

for line in index_file:
cdxj_record = CdxjRecord.from_cdxline(line.decode(), wacz_file=self)
cdxj_records[cdxj_record.data["url"]].append(cdxj_record)
if cdxj_record:
cdxj_records[cdxj_record.data["url"]].append(cdxj_record)

return cdxj_records

Expand Down Expand Up @@ -131,4 +132,4 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]:
record has its `wacz_file` attribute set to the corresponding WACZ file.
"""

yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index())
yield from (cdxj_record for wacz in self.waczs for cdxj_record in wacz.iter_index() if cdxj_record)
30 changes: 17 additions & 13 deletions tests/test_cdxj.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import pytest

from scrapy_webarchive.cdxj.models import CdxjRecord

Expand Down Expand Up @@ -26,30 +25,35 @@ def test_cdxj_record_invalid_format():
# Invalid CDXJ line (missing date)
invalid_cdxj_line = "com,example)/index {\"url\": \"http://example.com/index\", \"status\": \"200\"}"

# Test that the invalid line raises a ValueError
with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None)
# Test that the invalid line returns None
assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None


def test_cdxj_record_invalid_json_data():
# Invalid JSON in CDXJ line
invalid_cdxj_line = "com,example)/index 20241003000000 {\"url\": \"http://example.com/index\", \"status\": \"200\""

# Test that the invalid JSON raises a ValueError
with pytest.raises(ValueError):
CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None)
# Test that the invalid JSON returns None
assert CdxjRecord.from_cdxline(invalid_cdxj_line, wacz_file=None) is None


def test_cdxj_record_empty_line():
# Test that an empty line raises a ValueError
with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
CdxjRecord.from_cdxline('', wacz_file=None)
# Test that an empty line returns None
assert CdxjRecord.from_cdxline('', wacz_file=None) is None


def test_cdxj_record_no_data_field():
# CDXJ line with no data field
no_data_cdxj_line = "com,example)/index 20241003000000"

# Test that no data field raises a ValueError
with pytest.raises(ValueError, match=r"Invalid CDXJ line:"):
CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None)
# Test that no data field returns None
assert CdxjRecord.from_cdxline(no_data_cdxj_line, wacz_file=None) is None


def test_cdxj_record_urn_pageinfo():
    """A CDXJ line whose key is a ``urn:pageinfo:`` URN should yield no record."""
    pageinfo_url = "urn:pageinfo:https://example.com/index"
    # Key, timestamp and JSON payload separated by single spaces; the URN is
    # used both as the line key and as the "url" value in the payload.
    cdxj_line = f'{pageinfo_url} 20241003000000 {{"url": "{pageinfo_url}" }}'

    record = CdxjRecord.from_cdxline(cdxj_line, wacz_file=None)

    # from_cdxline is expected to return None here rather than raise.
    assert record is None
Loading