|
29 | 29 | }, |
30 | 30 | { |
31 | 31 | "cell_type": "code", |
32 | | - "execution_count": 2, |
| 32 | + "execution_count": 1, |
33 | 33 | "metadata": {}, |
34 | 34 | "outputs": [], |
35 | 35 | "source": [ |
|
49 | 49 | }, |
50 | 50 | { |
51 | 51 | "cell_type": "code", |
52 | | - "execution_count": 5, |
| 52 | + "execution_count": 2, |
53 | 53 | "metadata": {}, |
54 | 54 | "outputs": [ |
55 | 55 | { |
|
76 | 76 | }, |
77 | 77 | { |
78 | 78 | "cell_type": "code", |
79 | | - "execution_count": 6, |
| 79 | + "execution_count": 3, |
80 | 80 | "metadata": {}, |
81 | 81 | "outputs": [ |
82 | 82 | { |
83 | 83 | "name": "stdout", |
84 | 84 | "output_type": "stream", |
85 | 85 | "text": [ |
86 | | - "Time to make var_names unique: 20.11086130142212\n", |
87 | | - "Time to create X table schema: 0.19924283027648926\n", |
88 | | - "Time to insert X data: 9.001039266586304\n" |
| 86 | + "Time to make var_names unique: 23.616740942001343\n", |
| 87 | + "Time to create X table structure: 0.24507379531860352\n", |
| 88 | + "Starting backed mode X table data insert. Total rows: 2700\n", |
| 89 | + "Processed chunk 0-2699 in 4.261802673339844 seconds\n", |
| 90 | + "\n", |
| 91 | + "Too close for missiles, switching to guns\n", |
| 92 | + "Creating X table from buffer file.\n", |
| 93 | + "This may take a while...\n", |
| 94 | + "Time to create X table from buffer: 67.55312919616699\n", |
| 95 | + "Finished inserting X data.\n" |
89 | 96 | ] |
90 | 97 | }, |
91 | 98 | { |
92 | 99 | "data": { |
93 | 100 | "text/plain": [ |
94 | | - "<MakeDb.MakeDb at 0x700fa0bc66c0>" |
| 101 | + "<AnnSQL.MakeDb.MakeDb at 0x7e32c02339b0>" |
95 | 102 | ] |
96 | 103 | }, |
97 | | - "execution_count": 6, |
| 104 | + "execution_count": 3, |
98 | 105 | "metadata": {}, |
99 | 106 | "output_type": "execute_result" |
100 | 107 | } |
101 | 108 | ], |
102 | 109 | "source": [ |
103 | | - "#this delete command is for testing purposes only. Remove this line in production \n", |
| 110 | + "adata = sc.read_h5ad(\"data/pbmc3k_raw.h5ad\", backed=\"r\")\n", |
| 111 | + "\n", |
| 112 | + "#this delete command is for testing purposes only. \n", |
104 | 113 | "if os.path.exists(\"db/pbmc3k.asql\"):\n", |
105 | 114 | "\tos.remove(\"db/pbmc3k.asql\")\n", |
| 115 | + "if os.path.exists(\"db/pbmc3k.asql.wal\"):\n", |
| 116 | + "\tos.remove(\"db/pbmc3k.asql.wal\")\n", |
| 117 | + "\n", |
| 118 | + "#high system memory (>24GB)\n", |
| 119 | + "MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=5000)\n", |
| 120 | + "\n", |
| 121 | + "# #medium system memory (12-24GB)\n", |
| 122 | + "# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=2500)\n", |
106 | 123 | "\n", |
107 | | - "MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\")" |
| 124 | + "# #low system memory (<12GB)\n", |
| 125 | + "# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=1000, make_buffer_file=True)" |
108 | 126 | ] |
109 | 127 | }, |
110 | 128 | { |
|
117 | 135 | }, |
118 | 136 | { |
119 | 137 | "cell_type": "code", |
120 | | - "execution_count": 9, |
| 138 | + "execution_count": 4, |
121 | 139 | "metadata": {}, |
122 | 140 | "outputs": [], |
123 | 141 | "source": [ |
|
133 | 151 | }, |
134 | 152 | { |
135 | 153 | "cell_type": "code", |
136 | | - "execution_count": 10, |
| 154 | + "execution_count": 5, |
137 | 155 | "metadata": {}, |
138 | 156 | "outputs": [ |
139 | 157 | { |
|
338 | 356 | "[5 rows x 32739 columns]" |
339 | 357 | ] |
340 | 358 | }, |
341 | | - "execution_count": 10, |
| 359 | + "execution_count": 5, |
342 | 360 | "metadata": {}, |
343 | 361 | "output_type": "execute_result" |
344 | 362 | } |
|
458 | 476 | "#total counts per gene \n", |
459 | 477 | "adata_sql.query(\"SELECT SUM(COLUMNS(*)) FROM (SELECT * EXCLUDE (cell_id) FROM X)\")" |
460 | 478 | ] |
461 | | - }, |
462 | | - { |
463 | | - "cell_type": "markdown", |
464 | | - "metadata": {}, |
465 | | - "source": [ |
466 | | - "### Normalize to 10k reads per library and log transform\n", |
467 | | - "Below, we illustrate how to do basic normalization and log transformations using AnnSQL. It's worth noting that for smaller datasets, there are no runtime benefits to using AnnSQL. For larger datasets (>50k cells), this method becomes more computationally feasible on a resource-limited computer. " |
468 | | - ] |
469 | | - }, |
470 | | - { |
471 | | - "cell_type": "code", |
472 | | - "execution_count": 7, |
473 | | - "metadata": {}, |
474 | | - "outputs": [ |
475 | | - { |
476 | | - "name": "stdout", |
477 | | - "output_type": "stream", |
478 | | - "text": [ |
479 | | - "Total counts column added\n", |
480 | | - "Total counts added\n", |
481 | | - "Normalized to 10k and log2\n" |
482 | | - ] |
483 | | - }, |
484 | | - { |
485 | | - "data": { |
486 | | - "text/html": [ |
487 | | - "<div>\n", |
488 | | - "<style scoped>\n", |
489 | | - " .dataframe tbody tr th:only-of-type {\n", |
490 | | - " vertical-align: middle;\n", |
491 | | - " }\n", |
492 | | - "\n", |
493 | | - " .dataframe tbody tr th {\n", |
494 | | - " vertical-align: top;\n", |
495 | | - " }\n", |
496 | | - "\n", |
497 | | - " .dataframe thead th {\n", |
498 | | - " text-align: right;\n", |
499 | | - " }\n", |
500 | | - "</style>\n", |
501 | | - "<table border=\"1\" class=\"dataframe\">\n", |
502 | | - " <thead>\n", |
503 | | - " <tr style=\"text-align: right;\">\n", |
504 | | - " <th></th>\n", |
505 | | - " <th>cell_id</th>\n", |
506 | | - " <th>MIR1302_10</th>\n", |
507 | | - " <th>FAM138A</th>\n", |
508 | | - " <th>OR4F5</th>\n", |
509 | | - " <th>RP11_34P13_7</th>\n", |
510 | | - " <th>RP11_34P13_8</th>\n", |
511 | | - " <th>AL627309_1</th>\n", |
512 | | - " <th>RP11_34P13_14</th>\n", |
513 | | - " <th>RP11_34P13_9</th>\n", |
514 | | - " <th>AP006222_2</th>\n", |
515 | | - " <th>...</th>\n", |
516 | | - " <th>AL590523_1</th>\n", |
517 | | - " <th>CT476828_1</th>\n", |
518 | | - " <th>PNRC2_1</th>\n", |
519 | | - " <th>SRSF10_1</th>\n", |
520 | | - " <th>AC145205_1</th>\n", |
521 | | - " <th>BAGE5</th>\n", |
522 | | - " <th>CU459201_1</th>\n", |
523 | | - " <th>AC002321_2</th>\n", |
524 | | - " <th>AC002321_1</th>\n", |
525 | | - " <th>total_counts</th>\n", |
526 | | - " </tr>\n", |
527 | | - " </thead>\n", |
528 | | - " <tbody>\n", |
529 | | - " <tr>\n", |
530 | | - " <th>0</th>\n", |
531 | | - " <td>AAACATACAACCAC-1</td>\n", |
532 | | - " <td>-16.60964</td>\n", |
533 | | - " <td>-16.60964</td>\n", |
534 | | - " <td>-16.60964</td>\n", |
535 | | - " <td>-16.60964</td>\n", |
536 | | - " <td>-16.60964</td>\n", |
537 | | - " <td>-16.60964</td>\n", |
538 | | - " <td>-16.60964</td>\n", |
539 | | - " <td>-16.60964</td>\n", |
540 | | - " <td>-16.60964</td>\n", |
541 | | - " <td>...</td>\n", |
542 | | - " <td>-16.60964</td>\n", |
543 | | - " <td>-16.60964</td>\n", |
544 | | - " <td>-16.60964</td>\n", |
545 | | - " <td>-16.60964</td>\n", |
546 | | - " <td>-16.60964</td>\n", |
547 | | - " <td>-16.60964</td>\n", |
548 | | - " <td>-16.60964</td>\n", |
549 | | - " <td>-16.60964</td>\n", |
550 | | - " <td>-16.60964</td>\n", |
551 | | - " <td>2421.0</td>\n", |
552 | | - " </tr>\n", |
553 | | - " <tr>\n", |
554 | | - " <th>1</th>\n", |
555 | | - " <td>AAACATTGAGCTAC-1</td>\n", |
556 | | - " <td>-16.60964</td>\n", |
557 | | - " <td>-16.60964</td>\n", |
558 | | - " <td>-16.60964</td>\n", |
559 | | - " <td>-16.60964</td>\n", |
560 | | - " <td>-16.60964</td>\n", |
561 | | - " <td>-16.60964</td>\n", |
562 | | - " <td>-16.60964</td>\n", |
563 | | - " <td>-16.60964</td>\n", |
564 | | - " <td>-16.60964</td>\n", |
565 | | - " <td>...</td>\n", |
566 | | - " <td>-16.60964</td>\n", |
567 | | - " <td>-16.60964</td>\n", |
568 | | - " <td>-16.60964</td>\n", |
569 | | - " <td>-16.60964</td>\n", |
570 | | - " <td>-16.60964</td>\n", |
571 | | - " <td>-16.60964</td>\n", |
572 | | - " <td>-16.60964</td>\n", |
573 | | - " <td>-16.60964</td>\n", |
574 | | - " <td>-16.60964</td>\n", |
575 | | - " <td>4903.0</td>\n", |
576 | | - " </tr>\n", |
577 | | - " <tr>\n", |
578 | | - " <th>2</th>\n", |
579 | | - " <td>AAACATTGATCAGC-1</td>\n", |
580 | | - " <td>-16.60964</td>\n", |
581 | | - " <td>-16.60964</td>\n", |
582 | | - " <td>-16.60964</td>\n", |
583 | | - " <td>-16.60964</td>\n", |
584 | | - " <td>-16.60964</td>\n", |
585 | | - " <td>-16.60964</td>\n", |
586 | | - " <td>-16.60964</td>\n", |
587 | | - " <td>-16.60964</td>\n", |
588 | | - " <td>-16.60964</td>\n", |
589 | | - " <td>...</td>\n", |
590 | | - " <td>-16.60964</td>\n", |
591 | | - " <td>-16.60964</td>\n", |
592 | | - " <td>-16.60964</td>\n", |
593 | | - " <td>-16.60964</td>\n", |
594 | | - " <td>-16.60964</td>\n", |
595 | | - " <td>-16.60964</td>\n", |
596 | | - " <td>-16.60964</td>\n", |
597 | | - " <td>-16.60964</td>\n", |
598 | | - " <td>-16.60964</td>\n", |
599 | | - " <td>3149.0</td>\n", |
600 | | - " </tr>\n", |
601 | | - " <tr>\n", |
602 | | - " <th>3</th>\n", |
603 | | - " <td>AAACCGTGCTTCCG-1</td>\n", |
604 | | - " <td>-16.60964</td>\n", |
605 | | - " <td>-16.60964</td>\n", |
606 | | - " <td>-16.60964</td>\n", |
607 | | - " <td>-16.60964</td>\n", |
608 | | - " <td>-16.60964</td>\n", |
609 | | - " <td>-16.60964</td>\n", |
610 | | - " <td>-16.60964</td>\n", |
611 | | - " <td>-16.60964</td>\n", |
612 | | - " <td>-16.60964</td>\n", |
613 | | - " <td>...</td>\n", |
614 | | - " <td>-16.60964</td>\n", |
615 | | - " <td>-16.60964</td>\n", |
616 | | - " <td>-16.60964</td>\n", |
617 | | - " <td>-16.60964</td>\n", |
618 | | - " <td>-16.60964</td>\n", |
619 | | - " <td>-16.60964</td>\n", |
620 | | - " <td>-16.60964</td>\n", |
621 | | - " <td>-16.60964</td>\n", |
622 | | - " <td>-16.60964</td>\n", |
623 | | - " <td>2639.0</td>\n", |
624 | | - " </tr>\n", |
625 | | - " <tr>\n", |
626 | | - " <th>4</th>\n", |
627 | | - " <td>AAACCGTGTATGCG-1</td>\n", |
628 | | - " <td>-16.60964</td>\n", |
629 | | - " <td>-16.60964</td>\n", |
630 | | - " <td>-16.60964</td>\n", |
631 | | - " <td>-16.60964</td>\n", |
632 | | - " <td>-16.60964</td>\n", |
633 | | - " <td>-16.60964</td>\n", |
634 | | - " <td>-16.60964</td>\n", |
635 | | - " <td>-16.60964</td>\n", |
636 | | - " <td>-16.60964</td>\n", |
637 | | - " <td>...</td>\n", |
638 | | - " <td>-16.60964</td>\n", |
639 | | - " <td>-16.60964</td>\n", |
640 | | - " <td>-16.60964</td>\n", |
641 | | - " <td>-16.60964</td>\n", |
642 | | - " <td>-16.60964</td>\n", |
643 | | - " <td>-16.60964</td>\n", |
644 | | - " <td>-16.60964</td>\n", |
645 | | - " <td>-16.60964</td>\n", |
646 | | - " <td>-16.60964</td>\n", |
647 | | - " <td>981.0</td>\n", |
648 | | - " </tr>\n", |
649 | | - " </tbody>\n", |
650 | | - "</table>\n", |
651 | | - "<p>5 rows × 32740 columns</p>\n", |
652 | | - "</div>" |
653 | | - ], |
654 | | - "text/plain": [ |
655 | | - " cell_id MIR1302_10 FAM138A OR4F5 RP11_34P13_7 \\\n", |
656 | | - "0 AAACATACAACCAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
657 | | - "1 AAACATTGAGCTAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
658 | | - "2 AAACATTGATCAGC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
659 | | - "3 AAACCGTGCTTCCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
660 | | - "4 AAACCGTGTATGCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
661 | | - "\n", |
662 | | - " RP11_34P13_8 AL627309_1 RP11_34P13_14 RP11_34P13_9 AP006222_2 ... \\\n", |
663 | | - "0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n", |
664 | | - "1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n", |
665 | | - "2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n", |
666 | | - "3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n", |
667 | | - "4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n", |
668 | | - "\n", |
669 | | - " AL590523_1 CT476828_1 PNRC2_1 SRSF10_1 AC145205_1 BAGE5 \\\n", |
670 | | - "0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
671 | | - "1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
672 | | - "2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
673 | | - "3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
674 | | - "4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n", |
675 | | - "\n", |
676 | | - " CU459201_1 AC002321_2 AC002321_1 total_counts \n", |
677 | | - "0 -16.60964 -16.60964 -16.60964 2421.0 \n", |
678 | | - "1 -16.60964 -16.60964 -16.60964 4903.0 \n", |
679 | | - "2 -16.60964 -16.60964 -16.60964 3149.0 \n", |
680 | | - "3 -16.60964 -16.60964 -16.60964 2639.0 \n", |
681 | | - "4 -16.60964 -16.60964 -16.60964 981.0 \n", |
682 | | - "\n", |
683 | | - "[5 rows x 32740 columns]" |
684 | | - ] |
685 | | - }, |
686 | | - "execution_count": 7, |
687 | | - "metadata": {}, |
688 | | - "output_type": "execute_result" |
689 | | - } |
690 | | - ], |
691 | | - "source": [ |
692 | | - "#get all gene names \n", |
693 | | - "gene_names = adata_sql.query(f\"Describe X\")['column_name'][1:].values\n", |
694 | | - "\n", |
695 | | - "#add a total counts column\n", |
696 | | - "adata_sql.query(f\"ALTER TABLE X ADD COLUMN total_counts FLOAT DEFAULT 0;\")\n", |
697 | | - "print(\"Total counts column added\")\n", |
698 | | - "\n", |
699 | | - "#iterates gene_names in chunks\n", |
700 | | - "chunk_size = 990 #Ddb limited to 1k\n", |
701 | | - "for i in range(0, len(gene_names), chunk_size):\n", |
702 | | - "\tchunk = gene_names[i:i+chunk_size]\n", |
703 | | - "\tchunk = \" + \".join(chunk) + \" + total_counts\"\n", |
704 | | - "\tadata_sql.update_query(f\"UPDATE X SET total_counts = ({chunk});\", suppress_message=True)\n", |
705 | | - "print(\"Total counts added\")\n", |
706 | | - "\n", |
707 | | - "#normalize to 10k and log2\n", |
708 | | - "chunk_size = 200 #reduces db memory usage\n", |
709 | | - "for i in range(0, len(gene_names), chunk_size):\n", |
710 | | - "\tupdates = []\n", |
711 | | - "\tchunk = gene_names[i:i + chunk_size]\n", |
712 | | - "\tfor gene in chunk:\n", |
713 | | - "\t\tupdates.append(f\"{gene} = LOG2(({gene} / total_counts) * 1e4 + 1e-5)\")\n", |
714 | | - "\tupdate_query = f\"UPDATE X SET {', '.join(updates)}\"\n", |
715 | | - "\tadata_sql.update_query(update_query, suppress_message=True)\n", |
716 | | - "print(\"Normalized to 10k and log2\")\n", |
717 | | - "\n", |
718 | | - "#show the first 5 rows\n", |
719 | | - "adata_sql.query(\"SELECT * FROM X LIMIT 5\")" |
720 | | - ] |
721 | 479 | } |
722 | 480 | ], |
723 | 481 | "metadata": { |
|
0 commit comments