# ProfOlaf/11_download_pdfs.py at main · sr-lab/ProfOlaf · GitHub

import os
import argparse
import csv
import time
import pathlib
import sys

from utils.article_processing.download_pdfs import download_pdf, is_valid_pdf


# Translation table mapping spaces and characters that are awkward or illegal
# in file names to "_" (same character set as the original replace chain).
_UNSAFE_TITLE_CHARS = str.maketrans({ch: "_" for ch in ' :/\\*?"<>.'})


def _title_to_article_id(title):
    """Return *title* with spaces and file-name-hostile characters replaced by "_"."""
    return title.translate(_UNSAFE_TITLE_CHARS)


def _download_from_csv(csv_file, output_folder):
    """Download every row of *csv_file* that has a URL into *output_folder*.

    Reads the ``title`` and ``url`` columns of each row. Rows without a URL and
    rows whose target file already exists are skipped.

    Returns:
        A list of ``(article_id, url)`` tuples for downloads that failed or
        produced a file that ``is_valid_pdf`` rejected.
    """
    failed_downloads = []

    # newline='' lets the csv module do its own line-ending handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            # DictReader fills columns missing from a short row with None, so
            # the .get() default is not enough; fall back to '' explicitly.
            title = (row.get('title') or '').strip()
            eprint_url = (row.get('url') or '').strip()

            if not eprint_url:
                continue

            article_id = _title_to_article_id(title)
            output_file = os.path.join(output_folder, f"{article_id}.pdf")

            if os.path.exists(output_file):
                print(f"File already exists: {output_file}")
                continue

            print(f"Downloading {article_id}.pdf from {eprint_url}")

            if not download_pdf(eprint_url, output_file):
                failed_downloads.append((article_id, eprint_url))
                continue

            if is_valid_pdf(output_file):
                print(f"Successfully downloaded and verified: {output_file}")
                # Pause between successful downloads (presumably to be polite
                # to the remote server).
                time.sleep(1)
                continue

            print(f"Downloaded but invalid PDF: {output_file}")
            try:
                os.remove(output_file)
            except FileNotFoundError:
                # Nothing was written to disk; one bad entry must not abort
                # the rest of the batch.
                pass
            failed_downloads.append((article_id, eprint_url))

    return failed_downloads


def _confirm_manual_download(article_id, url, output_folder):
    """Prompt until the user has saved a valid PDF for *article_id* by hand.

    The loop only ends once ``<output_folder>/<article_id>.pdf`` exists and
    passes ``is_valid_pdf``; answering "n" or anything unrecognised re-prompts.
    """
    print(f"\nID {article_id}: {url}")
    print(f"Please manually download and save as: {output_folder}/{article_id}.pdf")
    file_path = os.path.join(output_folder, f"{article_id}.pdf")

    while True:
        response = input("Have you completed the download? (y/n): ").lower().strip()

        if response in ('n', 'no'):
            print("Please complete the download and try again.")
            continue
        if response not in ('y', 'yes'):
            print("Please answer with 'y' or 'n'.")
            continue

        if not os.path.exists(file_path):
            print(f"✗ File {article_id}.pdf not found. Please ensure it's saved correctly.")
        elif not is_valid_pdf(file_path):
            print(f"✗ File {article_id}.pdf exists but is not a valid PDF. Please ensure it's a valid PDF file.")
        else:
            print(f"✓ File {article_id}.pdf verified successfully!")
            return


def main() -> None:
    """Download the PDFs listed in a CSV file, then walk through any failures.

    Command-line options:
        --csv_file        CSV with ``title`` and ``url`` columns.
        --article_folder  Folder the PDFs are written to (created if missing).

    Exits with status 1 when the CSV file does not exist. For every download
    that fails or yields an invalid PDF, the user is asked interactively to
    fetch the file by hand until a valid PDF is found at the expected path.
    """
    parser = argparse.ArgumentParser(description='Download PDFs')
    # NOTE(review): search_conf and analysis_conf are neither defined nor
    # imported in the visible part of this file — confirm where they come
    # from, otherwise these two lines raise NameError.
    parser.add_argument('--csv_file', help='CSV file', type=str, default=search_conf["csv_path"])
    parser.add_argument('--article_folder', help='Output folder', type=str, default=analysis_conf["articles_folder"])
    args = parser.parse_args()
    csv_file = args.csv_file
    output_folder = args.article_folder

    if not os.path.exists(csv_file):
        print(f"CSV file not found: {csv_file}")
        sys.exit(1)

    pathlib.Path(output_folder).mkdir(parents=True, exist_ok=True)

    print(f"Downloading PDFs from {csv_file} to {output_folder}")
    failed_downloads = _download_from_csv(csv_file, output_folder)

    if failed_downloads:
        print("\nFailed downloads:")
        for article_id, url in failed_downloads:
            _confirm_manual_download(article_id, url, output_folder)

    print(f"\nDownload complete. Files saved to: {output_folder}")

# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()