1+ import pandas as pd
2+ from google .oauth2 .credentials import Credentials
3+ from googleapiclient .discovery import build
4+ from googleapiclient .http import MediaFileUpload , MediaIoBaseDownload
5+ from google_auth_oauthlib .flow import InstalledAppFlow
6+ from google .auth .transport .requests import Request
7+ import os
8+ import io
9+ import json
10+ from pymongo .mongo_client import MongoClient
11+ from tqdm import tqdm
12+
13+
def get_drive_service(
    client_secrets_path="/Users/seyeon/Desktop/데이터구축/TableMagnifier/info/client.json",
    token_path="token.json",
):
    """Build an authorized Google Drive v3 client with read-only scope.

    Cached credentials are loaded from ``token_path`` when that file exists.
    If they are missing or invalid, they are refreshed (when expired and a
    refresh token is available) or obtained through the installed-app OAuth
    flow on ``localhost:8080`` with a browser window. Credentials that were
    refreshed or newly obtained are written back to ``token_path``.

    Args:
        client_secrets_path: Path to the OAuth client secrets JSON file.
            Defaults to the location previously hard-coded in this function.
        token_path: Path of the cached user-token file to read and write.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=...)``.
    """
    # Read-only Drive scope.
    scopes = ["https://www.googleapis.com/auth/drive.readonly"]

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, scopes)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                client_secrets_path,
                scopes,
            )
            creds = flow.run_local_server(
                host="localhost",
                port=8080,
                open_browser=True,
            )

        # Persist the new/refreshed credentials for the next run.
        with open(token_path, "w") as token:
            token.write(creds.to_json())

    return build("drive", "v3", credentials=creds)
36+
37+
def find_root_folder(service):
    """Return all non-trashed folders located directly under the Drive root.

    The listing is paginated: every page is fetched by following
    ``nextPageToken`` until the API stops returning one, so folders beyond
    the first page are not silently dropped.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.

    Returns:
        A list of dicts with the ``id`` and ``name`` of each folder.
    """
    query = (
        "'root' in parents and "
        "mimeType='application/vnd.google-apps.folder' and trashed=false"
    )
    root_folders = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            fields="nextPageToken, files(id,name)",
            pageToken=page_token,
        ).execute()
        root_folders.extend(response.get("files", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            return root_folders
45+
46+
def child_folders(service, folder_id, page_size=200):
    """Return every non-trashed item whose parent is ``folder_id``.

    Despite the name, the query does not filter on MIME type, so both
    folders and regular files are returned; each entry carries ``mimeType``
    so callers can tell them apart.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        folder_id: ID of the parent folder.
        page_size: Number of items requested per API page. All pages are
            fetched, so this only controls the request granularity.

    Returns:
        A list of dicts with ``id``, ``name`` and ``mimeType``.
    """
    # The ID must sit directly between the quotes: a stray space inside the
    # quoted literal would make the query look for a different parent ID.
    query = f"'{folder_id}' in parents and trashed = false"

    children = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            pageSize=page_size,
            fields="nextPageToken, files(id,name,mimeType)",
            pageToken=page_token,
        ).execute()
        children.extend(response.get("files", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            return children
55+
56+
def download_file_bytes(service, file_id):
    """Download the Drive file identified by ``file_id`` and return its bytes.

    The content is streamed chunk by chunk into an in-memory buffer until
    the downloader reports completion.
    """
    buffer = io.BytesIO()
    media_request = service.files().get_media(fileId=file_id)
    downloader = MediaIoBaseDownload(buffer, media_request)

    finished = False
    while not finished:
        # next_chunk() returns (status, done); only the done flag is needed.
        _status, finished = downloader.next_chunk()

    return buffer.getvalue()
67+
68+
def list_all_files_in_folder(service, folder_id):
    """Return every non-trashed, non-folder item directly inside ``folder_id``.

    Pages of up to 1000 items are requested and ``nextPageToken`` is
    followed until the listing is exhausted.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        folder_id: ID of the parent folder.

    Returns:
        A list of dicts with ``id``, ``name`` and ``mimeType``.
    """
    # The ID must sit directly between the quotes: a stray space inside the
    # quoted literal would make the query look for a different parent ID.
    query = (
        f"'{folder_id}' in parents and "
        "mimeType != 'application/vnd.google-apps.folder' and "
        "trashed = false"
    )

    files = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            fields="nextPageToken, files(id,name,mimeType)",
            pageToken=page_token,
            pageSize=1000,
        ).execute()

        files.extend(response.get("files", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            break

    return files
91+
92+
def save_image_to_local(file_bytes, file_path):
    """Write the given bytes to ``file_path`` and print the saved path."""
    with open(file_path, mode='wb') as image_file:
        image_file.write(file_bytes)

    saved_message = f"이미지 저장: {file_path} "
    print(saved_message)
98+
99+
def mongo_client(PASSWORD, collection_name):
    """Connect to the TableMagnifier cluster and return one collection.

    Args:
        PASSWORD: Raw (unescaped) password of the ``TableMagnifier`` user.
        collection_name: Name of the collection inside the
            ``TableInformation`` database. The original comment lists
            Academic, Business, Finance, Insurance, Medical and Public.

    Returns:
        The collection object ``client['TableInformation'][collection_name]``.
    """
    from urllib.parse import quote_plus

    # Percent-escape the password so characters such as '@', ':' or '/'
    # cannot corrupt the URI, and keep the credentials directly adjacent to
    # '@' (no stray whitespace).
    uri = (
        f"mongodb+srv://TableMagnifier:{quote_plus(PASSWORD)}"
        "@tablemagnifier.gf5mkkc.mongodb.net/?appName=TableMagnifier"
    )

    # NOTE(review): tlsAllowInvalidCertificates=True disables server
    # certificate validation. It is kept to preserve existing behavior;
    # confirm whether it is still required.
    client = MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
    )

    # Select the database, then the collection by its exact name.
    db = client['TableInformation']
    return db[str(collection_name)]
113+
def get_file_information(file_id, service=None):
    """Fetch the ``id``, ``name`` and ``mimeType`` of a Drive file.

    Args:
        file_id: ID of the Drive file to look up.
        service: Drive client exposing ``files().get(...).execute()``.
            When omitted, a client is created with ``get_drive_service()``
            (the function previously referenced an undefined ``service``
            name and could not run).

    Returns:
        The dict returned by the ``files().get`` request.
    """
    if service is None:
        service = get_drive_service()

    return service.files().get(
        fileId=file_id,
        fields="id,name,mimeType",
    ).execute()
120+
def table_json_format():
    """Return a new, empty record template for one table image.

    The four text fields start as empty strings, and ``QAPair`` and
    ``Evaluation_Result`` start as empty dicts. Each call builds fresh
    objects.
    """
    text_fields = ("Domain", "ImageFileName", "ImageFileID", "HTMLText")
    template = dict.fromkeys(text_fields, "")
    template["QAPair"] = {}
    template["Evaluation_Result"] = {}
    return template
130+
131+
def _build_table_record(domain, drive_file):
    """Return a fresh record filled with the domain and the file's name and ID."""
    record = table_json_format()
    record['Domain'] = domain
    record['ImageFileName'] = drive_file["name"]
    record["ImageFileID"] = drive_file["id"]
    return record


def database_data_insert(PASSWORD, start_folder_id=""):
    """Insert one MongoDB document per file found under each domain's ``Table`` folder.

    Traversal: the children of ``start_folder_id`` are treated as domains.
    Inside each domain the child named ``Table`` is located, and every file
    inside each of its sub-folders is inserted into the collection named
    after the domain.

    Args:
        PASSWORD: Password forwarded to ``mongo_client``.
        start_folder_id: Drive ID of the folder whose children are the
            domain folders. Defaults to the empty placeholder that was
            previously hard-coded here.
    """
    service = get_drive_service()

    # Names are used as dict keys verbatim; padding them would make the
    # later lookups (including "Table") fail.
    domains = {
        item["name"]: item["id"]
        for item in child_folders(service, start_folder_id)
    }

    for domain, domain_id in domains.items():
        sub_folders = {
            item["name"]: item["id"]
            for item in child_folders(service, domain_id)
        }

        table_root_id = sub_folders.get("Table")
        if table_root_id is None:
            # Nothing to load for this entry; skip it instead of aborting
            # the whole run with a KeyError.
            print(f"Skipping '{domain}': no 'Table' folder found")
            continue

        collection = mongo_client(PASSWORD, domain)

        for folder in child_folders(service, table_root_id):
            files = list_all_files_in_folder(service, folder["id"])

            # Build and store one document per file.
            for drive_file in files:
                record = _build_table_record(domain, drive_file)
                print(record)
                collection.insert_one(record)
172+
173+
if __name__ == '__main__':
    # create_csv()  # disabled; no create_csv is defined in the visible file
    # Read the password from the environment instead of editing it into the
    # source; falls back to the previous empty-string placeholder.
    PASSWORD = os.environ.get("MONGODB_PASSWORD", "")
    database_data_insert(PASSWORD)
178+
0 commit comments