Skip to content

Commit c66fb84

Browse files
committed
DuckDb chunked insert memory leak workaround implemented with make_buffer_file parameter
1 parent 9ff2e68 commit c66fb84

File tree

6 files changed

+109
-289
lines changed

6 files changed

+109
-289
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ adata_sql.query("SELECT corr(ITGB2,SSU72) as correlation FROM adata WHERE bulk_l
222222
<li><code>db_name</code>: Name for the database (required)</li>
223223
<li><code>db_path</code>: Path to store the database (default: 'db/')</li>
224224
<li><code>chunk_size</code>: When opening AnnData in backed mode, the number of rows to insert per chunk. Lower the value for low-memory systems (default: 5000)</li>
225+
<li><code>make_buffer_file</code>: For memory errors on very low-memory systems, set this flag to True. It mitigates the DuckDB memory leak during chunked inserts by creating a buffer file. Be sure to have hard drive space available equal to 2x the size of your AnnData object. (default: False)</li>
225226
<li><code>layers</code>: List (optional. default: ["X", "obs", "var", "var_names", "obsm", "varm", "obsp", "uns"]).<i>The layers of the Anndata object to build into the database. For larger datasets, it may be beneficial to only include the layers you're interested in querying.</i></li>
226227
<li><code>create_basic_indexes</code>: Build indexes on cell_id (optional. default: False)</li>
227228
<li><code>create_all_indexes</code>: Boolean (optional. default: False). <i>Warning: Runtime can be significant when building.</i></li>

examples/build_ondisk_database.ipynb

Lines changed: 31 additions & 273 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
},
3030
{
3131
"cell_type": "code",
32-
"execution_count": 2,
32+
"execution_count": 1,
3333
"metadata": {},
3434
"outputs": [],
3535
"source": [
@@ -49,7 +49,7 @@
4949
},
5050
{
5151
"cell_type": "code",
52-
"execution_count": 5,
52+
"execution_count": 2,
5353
"metadata": {},
5454
"outputs": [
5555
{
@@ -76,35 +76,53 @@
7676
},
7777
{
7878
"cell_type": "code",
79-
"execution_count": 6,
79+
"execution_count": 3,
8080
"metadata": {},
8181
"outputs": [
8282
{
8383
"name": "stdout",
8484
"output_type": "stream",
8585
"text": [
86-
"Time to make var_names unique: 20.11086130142212\n",
87-
"Time to create X table schema: 0.19924283027648926\n",
88-
"Time to insert X data: 9.001039266586304\n"
86+
"Time to make var_names unique: 23.616740942001343\n",
87+
"Time to create X table structure: 0.24507379531860352\n",
88+
"Starting backed mode X table data insert. Total rows: 2700\n",
89+
"Processed chunk 0-2699 in 4.261802673339844 seconds\n",
90+
"\n",
91+
"Too close for missiles, switching to guns\n",
92+
"Creating X table from buffer file.\n",
93+
"This may take a while...\n",
94+
"Time to create X table from buffer: 67.55312919616699\n",
95+
"Finished inserting X data.\n"
8996
]
9097
},
9198
{
9299
"data": {
93100
"text/plain": [
94-
"<MakeDb.MakeDb at 0x700fa0bc66c0>"
101+
"<AnnSQL.MakeDb.MakeDb at 0x7e32c02339b0>"
95102
]
96103
},
97-
"execution_count": 6,
104+
"execution_count": 3,
98105
"metadata": {},
99106
"output_type": "execute_result"
100107
}
101108
],
102109
"source": [
103-
"#this delete command is for testing purposes only. Remove this line in production \n",
110+
"adata = sc.read_h5ad(\"data/pbmc3k_raw.h5ad\", backed=\"r\")\n",
111+
"\n",
112+
"#this delete command is for testing purposes only. \n",
104113
"if os.path.exists(\"db/pbmc3k.asql\"):\n",
105114
"\tos.remove(\"db/pbmc3k.asql\")\n",
115+
"if os.path.exists(\"db/pbmc3k.asql.wal\"):\n",
116+
"\tos.remove(\"db/pbmc3k.asql.wal\")\n",
117+
"\n",
118+
"#high system memory (>24Gb)\n",
119+
"MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=5000)\n",
120+
"\n",
121+
"# #medium system memory (12-24Gb)\n",
122+
"# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=2500)\n",
106123
"\n",
107-
"MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\")"
124+
"# #low system memory (<12Gb)\n",
125+
"# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=1000, make_buffer_file=True)"
108126
]
109127
},
110128
{
@@ -117,7 +135,7 @@
117135
},
118136
{
119137
"cell_type": "code",
120-
"execution_count": 9,
138+
"execution_count": 4,
121139
"metadata": {},
122140
"outputs": [],
123141
"source": [
@@ -133,7 +151,7 @@
133151
},
134152
{
135153
"cell_type": "code",
136-
"execution_count": 10,
154+
"execution_count": 5,
137155
"metadata": {},
138156
"outputs": [
139157
{
@@ -338,7 +356,7 @@
338356
"[5 rows x 32739 columns]"
339357
]
340358
},
341-
"execution_count": 10,
359+
"execution_count": 5,
342360
"metadata": {},
343361
"output_type": "execute_result"
344362
}
@@ -458,266 +476,6 @@
458476
"#total counts per gene \n",
459477
"adata_sql.query(\"SELECT SUM(COLUMNS(*)) FROM (SELECT * EXCLUDE (cell_id) FROM X)\")"
460478
]
461-
},
462-
{
463-
"cell_type": "markdown",
464-
"metadata": {},
465-
"source": [
466-
"### Normalize to 10k reads per library and log transform\n",
467-
"Below, we illustrate how to do basic normalization and log transformations using AnnSQL. It's worth nothing that for smaller datasets, there are no runtime benefits over using AnnSQL. For larger datasets that are >50k cells; this method becomes more computationally feasible on a resource limited computer. "
468-
]
469-
},
470-
{
471-
"cell_type": "code",
472-
"execution_count": 7,
473-
"metadata": {},
474-
"outputs": [
475-
{
476-
"name": "stdout",
477-
"output_type": "stream",
478-
"text": [
479-
"Total counts column added\n",
480-
"Total counts added\n",
481-
"Normalized to 10k and log2\n"
482-
]
483-
},
484-
{
485-
"data": {
486-
"text/html": [
487-
"<div>\n",
488-
"<style scoped>\n",
489-
" .dataframe tbody tr th:only-of-type {\n",
490-
" vertical-align: middle;\n",
491-
" }\n",
492-
"\n",
493-
" .dataframe tbody tr th {\n",
494-
" vertical-align: top;\n",
495-
" }\n",
496-
"\n",
497-
" .dataframe thead th {\n",
498-
" text-align: right;\n",
499-
" }\n",
500-
"</style>\n",
501-
"<table border=\"1\" class=\"dataframe\">\n",
502-
" <thead>\n",
503-
" <tr style=\"text-align: right;\">\n",
504-
" <th></th>\n",
505-
" <th>cell_id</th>\n",
506-
" <th>MIR1302_10</th>\n",
507-
" <th>FAM138A</th>\n",
508-
" <th>OR4F5</th>\n",
509-
" <th>RP11_34P13_7</th>\n",
510-
" <th>RP11_34P13_8</th>\n",
511-
" <th>AL627309_1</th>\n",
512-
" <th>RP11_34P13_14</th>\n",
513-
" <th>RP11_34P13_9</th>\n",
514-
" <th>AP006222_2</th>\n",
515-
" <th>...</th>\n",
516-
" <th>AL590523_1</th>\n",
517-
" <th>CT476828_1</th>\n",
518-
" <th>PNRC2_1</th>\n",
519-
" <th>SRSF10_1</th>\n",
520-
" <th>AC145205_1</th>\n",
521-
" <th>BAGE5</th>\n",
522-
" <th>CU459201_1</th>\n",
523-
" <th>AC002321_2</th>\n",
524-
" <th>AC002321_1</th>\n",
525-
" <th>total_counts</th>\n",
526-
" </tr>\n",
527-
" </thead>\n",
528-
" <tbody>\n",
529-
" <tr>\n",
530-
" <th>0</th>\n",
531-
" <td>AAACATACAACCAC-1</td>\n",
532-
" <td>-16.60964</td>\n",
533-
" <td>-16.60964</td>\n",
534-
" <td>-16.60964</td>\n",
535-
" <td>-16.60964</td>\n",
536-
" <td>-16.60964</td>\n",
537-
" <td>-16.60964</td>\n",
538-
" <td>-16.60964</td>\n",
539-
" <td>-16.60964</td>\n",
540-
" <td>-16.60964</td>\n",
541-
" <td>...</td>\n",
542-
" <td>-16.60964</td>\n",
543-
" <td>-16.60964</td>\n",
544-
" <td>-16.60964</td>\n",
545-
" <td>-16.60964</td>\n",
546-
" <td>-16.60964</td>\n",
547-
" <td>-16.60964</td>\n",
548-
" <td>-16.60964</td>\n",
549-
" <td>-16.60964</td>\n",
550-
" <td>-16.60964</td>\n",
551-
" <td>2421.0</td>\n",
552-
" </tr>\n",
553-
" <tr>\n",
554-
" <th>1</th>\n",
555-
" <td>AAACATTGAGCTAC-1</td>\n",
556-
" <td>-16.60964</td>\n",
557-
" <td>-16.60964</td>\n",
558-
" <td>-16.60964</td>\n",
559-
" <td>-16.60964</td>\n",
560-
" <td>-16.60964</td>\n",
561-
" <td>-16.60964</td>\n",
562-
" <td>-16.60964</td>\n",
563-
" <td>-16.60964</td>\n",
564-
" <td>-16.60964</td>\n",
565-
" <td>...</td>\n",
566-
" <td>-16.60964</td>\n",
567-
" <td>-16.60964</td>\n",
568-
" <td>-16.60964</td>\n",
569-
" <td>-16.60964</td>\n",
570-
" <td>-16.60964</td>\n",
571-
" <td>-16.60964</td>\n",
572-
" <td>-16.60964</td>\n",
573-
" <td>-16.60964</td>\n",
574-
" <td>-16.60964</td>\n",
575-
" <td>4903.0</td>\n",
576-
" </tr>\n",
577-
" <tr>\n",
578-
" <th>2</th>\n",
579-
" <td>AAACATTGATCAGC-1</td>\n",
580-
" <td>-16.60964</td>\n",
581-
" <td>-16.60964</td>\n",
582-
" <td>-16.60964</td>\n",
583-
" <td>-16.60964</td>\n",
584-
" <td>-16.60964</td>\n",
585-
" <td>-16.60964</td>\n",
586-
" <td>-16.60964</td>\n",
587-
" <td>-16.60964</td>\n",
588-
" <td>-16.60964</td>\n",
589-
" <td>...</td>\n",
590-
" <td>-16.60964</td>\n",
591-
" <td>-16.60964</td>\n",
592-
" <td>-16.60964</td>\n",
593-
" <td>-16.60964</td>\n",
594-
" <td>-16.60964</td>\n",
595-
" <td>-16.60964</td>\n",
596-
" <td>-16.60964</td>\n",
597-
" <td>-16.60964</td>\n",
598-
" <td>-16.60964</td>\n",
599-
" <td>3149.0</td>\n",
600-
" </tr>\n",
601-
" <tr>\n",
602-
" <th>3</th>\n",
603-
" <td>AAACCGTGCTTCCG-1</td>\n",
604-
" <td>-16.60964</td>\n",
605-
" <td>-16.60964</td>\n",
606-
" <td>-16.60964</td>\n",
607-
" <td>-16.60964</td>\n",
608-
" <td>-16.60964</td>\n",
609-
" <td>-16.60964</td>\n",
610-
" <td>-16.60964</td>\n",
611-
" <td>-16.60964</td>\n",
612-
" <td>-16.60964</td>\n",
613-
" <td>...</td>\n",
614-
" <td>-16.60964</td>\n",
615-
" <td>-16.60964</td>\n",
616-
" <td>-16.60964</td>\n",
617-
" <td>-16.60964</td>\n",
618-
" <td>-16.60964</td>\n",
619-
" <td>-16.60964</td>\n",
620-
" <td>-16.60964</td>\n",
621-
" <td>-16.60964</td>\n",
622-
" <td>-16.60964</td>\n",
623-
" <td>2639.0</td>\n",
624-
" </tr>\n",
625-
" <tr>\n",
626-
" <th>4</th>\n",
627-
" <td>AAACCGTGTATGCG-1</td>\n",
628-
" <td>-16.60964</td>\n",
629-
" <td>-16.60964</td>\n",
630-
" <td>-16.60964</td>\n",
631-
" <td>-16.60964</td>\n",
632-
" <td>-16.60964</td>\n",
633-
" <td>-16.60964</td>\n",
634-
" <td>-16.60964</td>\n",
635-
" <td>-16.60964</td>\n",
636-
" <td>-16.60964</td>\n",
637-
" <td>...</td>\n",
638-
" <td>-16.60964</td>\n",
639-
" <td>-16.60964</td>\n",
640-
" <td>-16.60964</td>\n",
641-
" <td>-16.60964</td>\n",
642-
" <td>-16.60964</td>\n",
643-
" <td>-16.60964</td>\n",
644-
" <td>-16.60964</td>\n",
645-
" <td>-16.60964</td>\n",
646-
" <td>-16.60964</td>\n",
647-
" <td>981.0</td>\n",
648-
" </tr>\n",
649-
" </tbody>\n",
650-
"</table>\n",
651-
"<p>5 rows × 32740 columns</p>\n",
652-
"</div>"
653-
],
654-
"text/plain": [
655-
" cell_id MIR1302_10 FAM138A OR4F5 RP11_34P13_7 \\\n",
656-
"0 AAACATACAACCAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
657-
"1 AAACATTGAGCTAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
658-
"2 AAACATTGATCAGC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
659-
"3 AAACCGTGCTTCCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
660-
"4 AAACCGTGTATGCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
661-
"\n",
662-
" RP11_34P13_8 AL627309_1 RP11_34P13_14 RP11_34P13_9 AP006222_2 ... \\\n",
663-
"0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
664-
"1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
665-
"2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
666-
"3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
667-
"4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
668-
"\n",
669-
" AL590523_1 CT476828_1 PNRC2_1 SRSF10_1 AC145205_1 BAGE5 \\\n",
670-
"0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
671-
"1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
672-
"2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
673-
"3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
674-
"4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
675-
"\n",
676-
" CU459201_1 AC002321_2 AC002321_1 total_counts \n",
677-
"0 -16.60964 -16.60964 -16.60964 2421.0 \n",
678-
"1 -16.60964 -16.60964 -16.60964 4903.0 \n",
679-
"2 -16.60964 -16.60964 -16.60964 3149.0 \n",
680-
"3 -16.60964 -16.60964 -16.60964 2639.0 \n",
681-
"4 -16.60964 -16.60964 -16.60964 981.0 \n",
682-
"\n",
683-
"[5 rows x 32740 columns]"
684-
]
685-
},
686-
"execution_count": 7,
687-
"metadata": {},
688-
"output_type": "execute_result"
689-
}
690-
],
691-
"source": [
692-
"#get all gene names \n",
693-
"gene_names = adata_sql.query(f\"Describe X\")['column_name'][1:].values\n",
694-
"\n",
695-
"#add a total counts column\n",
696-
"adata_sql.query(f\"ALTER TABLE X ADD COLUMN total_counts FLOAT DEFAULT 0;\")\n",
697-
"print(\"Total counts column added\")\n",
698-
"\n",
699-
"#iterates gene_names in chunks\n",
700-
"chunk_size = 990 #Ddb limited to 1k\n",
701-
"for i in range(0, len(gene_names), chunk_size):\n",
702-
"\tchunk = gene_names[i:i+chunk_size]\n",
703-
"\tchunk = \" + \".join(chunk) + \" + total_counts\"\n",
704-
"\tadata_sql.update_query(f\"UPDATE X SET total_counts = ({chunk});\", suppress_message=True)\n",
705-
"print(\"Total counts added\")\n",
706-
"\n",
707-
"#normalize to 10k and log2\n",
708-
"chunk_size = 200 #reduces db memory usage\n",
709-
"for i in range(0, len(gene_names), chunk_size):\n",
710-
"\tupdates = []\n",
711-
"\tchunk = gene_names[i:i + chunk_size]\n",
712-
"\tfor gene in chunk:\n",
713-
"\t\tupdates.append(f\"{gene} = LOG2(({gene} / total_counts) * 1e4 + 1e-5)\")\n",
714-
"\tupdate_query = f\"UPDATE X SET {', '.join(updates)}\"\n",
715-
"\tadata_sql.update_query(update_query, suppress_message=True)\n",
716-
"print(\"Normalized to 10k and log2\")\n",
717-
"\n",
718-
"#show the first 5 rows\n",
719-
"adata_sql.query(\"SELECT * FROM X LIMIT 5\")"
720-
]
721479
}
722480
],
723481
"metadata": {

src/AnnSQL.egg-info/PKG-INFO

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ adata_sql.query("SELECT corr(ITGB2,SSU72) as correlation FROM adata WHERE bulk_l
238238
<li><code>db_name</code>: Name for the database (required)</li>
239239
<li><code>db_path</code>: Path to store the database (default: 'db/')</li>
240240
<li><code>chunk_size</code>: When opening AnnData in backed mode, the number of rows to insert per chunk. Lower the value for low-memory systems (default: 5000)</li>
241+
<li><code>make_buffer_file</code>: For memory errors on very low-memory systems, set this flag to True. It mitigates the DuckDB memory leak during chunked inserts by creating a buffer file. Be sure to have hard drive space available equal to 2x the size of your AnnData object. (default: False)</li>
241242
<li><code>layers</code>: List (optional. default: ["X", "obs", "var", "var_names", "obsm", "varm", "obsp", "uns"]).<i>The layers of the Anndata object to build into the database. For larger datasets, it may be beneficial to only include the layers you're interested in querying.</i></li>
242243
<li><code>create_basic_indexes</code>: Build indexes on cell_id (optional. default: False)</li>
243244
<li><code>create_all_indexes</code>: Boolean (optional. default: False). <i>Warning: Runtime can be significant when building.</i></li>

0 commit comments

Comments
 (0)