Skip to content

Commit eabd526

Browse files
authored
Merge pull request #23 from Pseudo-Lab/feature/db_formatting
ADD : DriveRead / DB Storage
2 parents 0c445f2 + b761bd3 commit eabd526

File tree

3 files changed

+182
-62
lines changed

3 files changed

+182
-62
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ apis/gemini_keys.yaml
5454
.dmypy.json
5555
dmypy.json
5656

57+
# database
58+
info/
59+
token.json
60+
5761
# env
5862
.bemad/
5963
./docs/

db/curd.py

Lines changed: 0 additions & 62 deletions
This file was deleted.

db/data_handling.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
import pandas as pd
2+
from google.oauth2.credentials import Credentials
3+
from googleapiclient.discovery import build
4+
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
5+
from google_auth_oauthlib.flow import InstalledAppFlow
6+
from google.auth.transport.requests import Request
7+
import os
8+
import io
9+
import json
10+
from pymongo.mongo_client import MongoClient
11+
from tqdm import tqdm
12+
13+
14+
def get_drive_service(
    client_secrets_path="/Users/seyeon/Desktop/데이터구축/TableMagnifier/info/client.json",
    token_path="token.json",
):
    """Build an authorized Google Drive v3 client with read-only scope.

    Cached credentials are loaded from ``token_path`` when that file exists.
    If they are missing or invalid they are refreshed (when a refresh token
    is available) or obtained through the local-server OAuth flow, and the
    resulting credentials are written back to ``token_path``.

    Args:
        client_secrets_path: Path to the OAuth client secrets JSON file.
            Defaults to the location that used to be hard-coded, so existing
            callers keep working; pass your own path on other machines.
        token_path: Path of the cached user token file.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=...)``.
    """
    # Google Drive API scope: read-only access.
    scopes = ['https://www.googleapis.com/auth/drive.readonly']

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, scopes)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                client_secrets_path,
                scopes,
            )
            creds = flow.run_local_server(
                host="localhost",
                port=8080,
                open_browser=True,
            )

        # Persist the new/refreshed credentials for the next run.
        with open(token_path, 'w') as token:
            token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)
36+
37+
38+
def find_root_folder(service):
    """Return every non-trashed folder located directly under the Drive root.

    The Drive ``files.list`` call returns results one page at a time, so this
    follows ``nextPageToken`` until the listing is exhausted instead of
    returning only the first page.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.

    Returns:
        A list of dicts with the ``id`` and ``name`` of each folder.
    """
    folders = []
    page_token = None
    while True:
        res = service.files().list(
            q="'root' in parents and mimeType='application/vnd.google-apps.folder' and trashed=false",
            fields="nextPageToken, files(id,name)",
            pageToken=page_token,
        ).execute()

        folders.extend(res.get("files", []))
        page_token = res.get("nextPageToken")

        if not page_token:
            return folders
45+
46+
47+
def child_folders(service, folder_id, page_size=200):
    """Return all non-trashed children of the Drive folder ``folder_id``.

    Despite the name, the query has no mimeType filter, so the result
    contains both sub-folders and files. Results are fetched page by page
    (``page_size`` items per request) until no ``nextPageToken`` is returned,
    so folders holding more than ``page_size`` children are not truncated.

    Args:
        service: Drive client exposing ``files().list(...).execute()``.
        folder_id: ID of the parent folder.
        page_size: Number of items requested per API call.

    Returns:
        A list of dicts with the ``id``, ``name`` and ``mimeType`` of each child.
    """
    q = f"'{folder_id}' in parents and trashed = false"
    children = []
    page_token = None
    while True:
        res = service.files().list(
            q=q,
            pageSize=page_size,
            fields="nextPageToken, files(id,name,mimeType)",
            pageToken=page_token,
        ).execute()

        children.extend(res.get("files", []))
        page_token = res.get("nextPageToken")

        if not page_token:
            return children
55+
56+
57+
def download_file_bytes(service, file_id):
    """Download the content of the Drive file ``file_id`` and return it as bytes.

    The media is streamed chunk by chunk into an in-memory buffer until the
    downloader reports completion.
    """
    buffer = io.BytesIO()
    media_request = service.files().get_media(fileId=file_id)
    downloader = MediaIoBaseDownload(buffer, media_request)

    finished = False
    while not finished:
        # next_chunk() returns (progress, done); only the done flag is needed.
        _, finished = downloader.next_chunk()

    return buffer.getvalue()
67+
68+
69+
def list_all_files_in_folder(service, folder_id):
    """Collect every non-folder, non-trashed item directly inside ``folder_id``.

    Pages through the Drive listing (1000 items per request) by following
    ``nextPageToken`` until none is returned.

    Returns:
        A list of dicts with the ``id``, ``name`` and ``mimeType`` of each file.
    """
    query = (
        f"'{folder_id}' in parents and "
        "mimeType != 'application/vnd.google-apps.folder' and "
        "trashed = false"
    )

    collected = []
    token = None
    has_more = True
    while has_more:
        response = service.files().list(
            q=query,
            fields="nextPageToken, files(id,name,mimeType)",
            pageToken=token,
            pageSize=1000,
        ).execute()

        collected += response.get("files", [])
        token = response.get("nextPageToken")
        has_more = bool(token)

    return collected
91+
92+
93+
def save_image_to_local(file_bytes, file_path):
    """Write raw byte data to ``file_path`` as an image file.

    Prints the destination path once the bytes have been written.
    """
    with open(file_path, 'wb') as image_file:
        image_file.write(file_bytes)
    print(f"이미지 저장: {file_path}")
98+
99+
100+
def mongo_client(PASSWORD, collection_name):
    """Connect to the TableMagnifier cluster and return one collection.

    Args:
        PASSWORD: Raw (not percent-encoded) password of the ``TableMagnifier``
            database user. It is percent-escaped here before being placed in
            the connection URI, so characters such as ``@``, ``:``, ``/`` or
            ``%`` no longer corrupt the URI.
        collection_name: Name of the collection inside the
            ``TableInformation`` database. The original comment lists
            Academic, Business, Finance, Insurance, Medical and Public.

    Returns:
        The selected collection object from the ``TableInformation`` database.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    uri = (
        f"mongodb+srv://TableMagnifier:{quote_plus(PASSWORD)}"
        "@tablemagnifier.gf5mkkc.mongodb.net/?appName=TableMagnifier"
    )
    client = MongoClient(
        uri,
        tls=True,
        # NOTE(review): this disables TLS certificate verification and makes
        # the connection vulnerable to man-in-the-middle attacks. Kept to
        # avoid breaking existing environments; replace with a proper CA
        # bundle (e.g. the tlsCAFile option) when possible.
        tlsAllowInvalidCertificates=True,
    )

    # Select the database, then the requested collection.
    db = client['TableInformation']
    return db[f'{collection_name}']
113+
114+
def get_file_information(file_id, service=None):
    """Fetch the ``id``, ``name`` and ``mimeType`` of a single Drive file.

    Args:
        file_id: ID of the Drive file to look up.
        service: Drive client exposing ``files().get(...).execute()``. When
            omitted, a client is created with ``get_drive_service()``. The
            previous version read an undefined name ``service`` and raised
            ``NameError`` on every call.

    Returns:
        The dict returned by the Drive ``files.get`` request.
    """
    if service is None:
        service = get_drive_service()

    return service.files().get(
        fileId=file_id,
        fields="id,name,mimeType",
    ).execute()
120+
121+
def table_json_format():
    """Return a fresh, empty record template for one table image.

    The four text fields start as empty strings; ``QAPair`` and
    ``Evaluation_Result`` start as new empty dicts on every call.
    """
    template = dict.fromkeys(
        ("Domain", "ImageFileName", "ImageFileID", "HTMLText"),
        "",
    )
    template["QAPair"] = {}
    template["Evaluation_Result"] = {}
    return template
130+
131+
132+
def database_data_insert(PASSWORD, start_folder_id=""):
    """Walk the Drive folder tree and insert one MongoDB document per file.

    Expected layout, as read by this function: each child of
    ``start_folder_id`` is treated as a domain; each domain contains a child
    named ``Table``; each child of ``Table`` is a folder whose non-folder
    files are recorded. For every such file a document built from
    ``table_json_format()`` is filled with the domain name, file name and
    file ID, printed, and inserted into the collection named after the domain.

    Args:
        PASSWORD: MongoDB password forwarded to ``mongo_client``.
        start_folder_id: ID of the Drive folder whose children are the domain
            folders. Replaces the empty ``START_FOLDER_ID`` constant that
            used to be hard-coded inside this function.

    Raises:
        ValueError: If ``start_folder_id`` is empty.
    """
    if not start_folder_id:
        raise ValueError(
            "start_folder_id is empty; pass the ID of the Drive folder "
            "that contains the domain folders"
        )

    service = get_drive_service()

    # Map each domain name to its Drive folder ID.
    domains = {
        folder["name"]: folder["id"]
        for folder in child_folders(service, start_folder_id)
    }

    for domain, domain_id in domains.items():
        collection = mongo_client(PASSWORD, domain)

        sub_folders = {
            folder["name"]: folder["id"]
            for folder in child_folders(service, domain_id)
        }

        # A domain without a "Table" child used to abort the whole run with
        # KeyError; skip it and keep processing the remaining domains.
        table_root_id = sub_folders.get("Table")
        if table_root_id is None:
            print(f"Skipping domain '{domain}': no 'Table' folder found")
            continue

        for folder in child_folders(service, table_root_id):
            files = list_all_files_in_folder(service, folder["id"])

            # Read each file's metadata and load it into the database.
            for file in files:
                file_json = table_json_format()
                file_json['Domain'] = domain
                file_json['ImageFileName'] = file["name"]
                file_json["ImageFileID"] = file["id"]
                print(file_json)
                collection.insert_one(file_json)
172+
173+
174+
if __name__ == '__main__':
    # Read the MongoDB password from the MONGO_PASSWORD environment variable
    # so it does not have to be written into (and committed with) this file.
    # Falls back to the original empty value when the variable is not set.
    PASSWORD = os.environ.get("MONGO_PASSWORD", "")
    database_data_insert(PASSWORD)
178+

0 commit comments

Comments
 (0)