Skip to content

Commit c66fb84

Browse files
committed
DuckDb chunked insert memory leak workaround implemented with make_buffer_file parameter
1 parent 9ff2e68 commit c66fb84

File tree

6 files changed

+109
-289
lines changed

6 files changed

+109
-289
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ adata_sql.query("SELECT corr(ITGB2,SSU72) as correlation FROM adata WHERE bulk_l
222222
<li><code>db_name</code>: Name for the database (required)</li>
223223
<li><code>db_path</code>: Path to store the database (default: 'db/')</li>
224224
<li><code>chunk_size</code>: When opening AnnData in backed mode, the number of rows to insert per chunk. Lower the value for low-memory systems (default: 5000)</li>
225+
<li><code>make_buffer_file</code>: For memory errors on very low-memory systems, set this flag to True. It mitigates the DuckDB memory leak during chunked inserts by creating a buffer file. Be sure to have hard drive space available equal to 2x the size of your AnnData object. (default: False)</li>
225226
<li><code>layers</code>: List (optional. default: ["X", "obs", "var", "var_names", "obsm", "varm", "obsp", "uns"]).<i>The layers of the Anndata object to build into the database. For larger datasets, it may be beneficial to only include the layers you're interested in querying.</i></li>
226227
<li><code>create_basic_indexes</code>: Build indexes on cell_id (optional. default: False)</li>
227228
<li><code>create_all_indexes</code>: Boolean (optional. default: False). <i>Warning: Runtime can be significant when building.</i></li>

examples/build_ondisk_database.ipynb

Lines changed: 31 additions & 273 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
},
3030
{
3131
"cell_type": "code",
32-
"execution_count": 2,
32+
"execution_count": 1,
3333
"metadata": {},
3434
"outputs": [],
3535
"source": [
@@ -49,7 +49,7 @@
4949
},
5050
{
5151
"cell_type": "code",
52-
"execution_count": 5,
52+
"execution_count": 2,
5353
"metadata": {},
5454
"outputs": [
5555
{
@@ -76,35 +76,53 @@
7676
},
7777
{
7878
"cell_type": "code",
79-
"execution_count": 6,
79+
"execution_count": 3,
8080
"metadata": {},
8181
"outputs": [
8282
{
8383
"name": "stdout",
8484
"output_type": "stream",
8585
"text": [
86-
"Time to make var_names unique: 20.11086130142212\n",
87-
"Time to create X table schema: 0.19924283027648926\n",
88-
"Time to insert X data: 9.001039266586304\n"
86+
"Time to make var_names unique: 23.616740942001343\n",
87+
"Time to create X table structure: 0.24507379531860352\n",
88+
"Starting backed mode X table data insert. Total rows: 2700\n",
89+
"Processed chunk 0-2699 in 4.261802673339844 seconds\n",
90+
"\n",
91+
"Too close for missiles, switching to guns\n",
92+
"Creating X table from buffer file.\n",
93+
"This may take a while...\n",
94+
"Time to create X table from buffer: 67.55312919616699\n",
95+
"Finished inserting X data.\n"
8996
]
9097
},
9198
{
9299
"data": {
93100
"text/plain": [
94-
"<MakeDb.MakeDb at 0x700fa0bc66c0>"
101+
"<AnnSQL.MakeDb.MakeDb at 0x7e32c02339b0>"
95102
]
96103
},
97-
"execution_count": 6,
104+
"execution_count": 3,
98105
"metadata": {},
99106
"output_type": "execute_result"
100107
}
101108
],
102109
"source": [
103-
"#this delete command is for testing purposes only. Remove this line in production \n",
110+
"adata = sc.read_h5ad(\"data/pbmc3k_raw.h5ad\", backed=\"r\")\n",
111+
"\n",
112+
"#this delete command is for testing purposes only. \n",
104113
"if os.path.exists(\"db/pbmc3k.asql\"):\n",
105114
"\tos.remove(\"db/pbmc3k.asql\")\n",
115+
"if os.path.exists(\"db/pbmc3k.asql.wal\"):\n",
116+
"\tos.remove(\"db/pbmc3k.asql.wal\")\n",
117+
"\n",
118+
"#high system memory (>24Gb)\n",
119+
"MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=5000)\n",
120+
"\n",
121+
"# #medium system memory (12-24Gb)\n",
122+
"# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=2500)\n",
106123
"\n",
107-
"MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\")"
124+
"# #low system memory (<12Gb)\n",
125+
"# MakeDb(adata=adata, db_name=\"pbmc3k\", db_path=\"db/\", chunk_size=1000, make_buffer_file=True)"
108126
]
109127
},
110128
{
@@ -117,7 +135,7 @@
117135
},
118136
{
119137
"cell_type": "code",
120-
"execution_count": 9,
138+
"execution_count": 4,
121139
"metadata": {},
122140
"outputs": [],
123141
"source": [
@@ -133,7 +151,7 @@
133151
},
134152
{
135153
"cell_type": "code",
136-
"execution_count": 10,
154+
"execution_count": 5,
137155
"metadata": {},
138156
"outputs": [
139157
{
@@ -338,7 +356,7 @@
338356
"[5 rows x 32739 columns]"
339357
]
340358
},
341-
"execution_count": 10,
359+
"execution_count": 5,
342360
"metadata": {},
343361
"output_type": "execute_result"
344362
}
@@ -458,266 +476,6 @@
458476
"#total counts per gene \n",
459477
"adata_sql.query(\"SELECT SUM(COLUMNS(*)) FROM (SELECT * EXCLUDE (cell_id) FROM X)\")"
460478
]
461-
},
462-
{
463-
"cell_type": "markdown",
464-
"metadata": {},
465-
"source": [
466-
"### Normalize to 10k reads per library and log transform\n",
467-
"Below, we illustrate how to do basic normalization and log transformations using AnnSQL. It's worth nothing that for smaller datasets, there are no runtime benefits over using AnnSQL. For larger datasets that are >50k cells; this method becomes more computationally feasible on a resource limited computer. "
468-
]
469-
},
470-
{
471-
"cell_type": "code",
472-
"execution_count": 7,
473-
"metadata": {},
474-
"outputs": [
475-
{
476-
"name": "stdout",
477-
"output_type": "stream",
478-
"text": [
479-
"Total counts column added\n",
480-
"Total counts added\n",
481-
"Normalized to 10k and log2\n"
482-
]
483-
},
484-
{
485-
"data": {
486-
"text/html": [
487-
"<div>\n",
488-
"<style scoped>\n",
489-
" .dataframe tbody tr th:only-of-type {\n",
490-
" vertical-align: middle;\n",
491-
" }\n",
492-
"\n",
493-
" .dataframe tbody tr th {\n",
494-
" vertical-align: top;\n",
495-
" }\n",
496-
"\n",
497-
" .dataframe thead th {\n",
498-
" text-align: right;\n",
499-
" }\n",
500-
"</style>\n",
501-
"<table border=\"1\" class=\"dataframe\">\n",
502-
" <thead>\n",
503-
" <tr style=\"text-align: right;\">\n",
504-
" <th></th>\n",
505-
" <th>cell_id</th>\n",
506-
" <th>MIR1302_10</th>\n",
507-
" <th>FAM138A</th>\n",
508-
" <th>OR4F5</th>\n",
509-
" <th>RP11_34P13_7</th>\n",
510-
" <th>RP11_34P13_8</th>\n",
511-
" <th>AL627309_1</th>\n",
512-
" <th>RP11_34P13_14</th>\n",
513-
" <th>RP11_34P13_9</th>\n",
514-
" <th>AP006222_2</th>\n",
515-
" <th>...</th>\n",
516-
" <th>AL590523_1</th>\n",
517-
" <th>CT476828_1</th>\n",
518-
" <th>PNRC2_1</th>\n",
519-
" <th>SRSF10_1</th>\n",
520-
" <th>AC145205_1</th>\n",
521-
" <th>BAGE5</th>\n",
522-
" <th>CU459201_1</th>\n",
523-
" <th>AC002321_2</th>\n",
524-
" <th>AC002321_1</th>\n",
525-
" <th>total_counts</th>\n",
526-
" </tr>\n",
527-
" </thead>\n",
528-
" <tbody>\n",
529-
" <tr>\n",
530-
" <th>0</th>\n",
531-
" <td>AAACATACAACCAC-1</td>\n",
532-
" <td>-16.60964</td>\n",
533-
" <td>-16.60964</td>\n",
534-
" <td>-16.60964</td>\n",
535-
" <td>-16.60964</td>\n",
536-
" <td>-16.60964</td>\n",
537-
" <td>-16.60964</td>\n",
538-
" <td>-16.60964</td>\n",
539-
" <td>-16.60964</td>\n",
540-
" <td>-16.60964</td>\n",
541-
" <td>...</td>\n",
542-
" <td>-16.60964</td>\n",
543-
" <td>-16.60964</td>\n",
544-
" <td>-16.60964</td>\n",
545-
" <td>-16.60964</td>\n",
546-
" <td>-16.60964</td>\n",
547-
" <td>-16.60964</td>\n",
548-
" <td>-16.60964</td>\n",
549-
" <td>-16.60964</td>\n",
550-
" <td>-16.60964</td>\n",
551-
" <td>2421.0</td>\n",
552-
" </tr>\n",
553-
" <tr>\n",
554-
" <th>1</th>\n",
555-
" <td>AAACATTGAGCTAC-1</td>\n",
556-
" <td>-16.60964</td>\n",
557-
" <td>-16.60964</td>\n",
558-
" <td>-16.60964</td>\n",
559-
" <td>-16.60964</td>\n",
560-
" <td>-16.60964</td>\n",
561-
" <td>-16.60964</td>\n",
562-
" <td>-16.60964</td>\n",
563-
" <td>-16.60964</td>\n",
564-
" <td>-16.60964</td>\n",
565-
" <td>...</td>\n",
566-
" <td>-16.60964</td>\n",
567-
" <td>-16.60964</td>\n",
568-
" <td>-16.60964</td>\n",
569-
" <td>-16.60964</td>\n",
570-
" <td>-16.60964</td>\n",
571-
" <td>-16.60964</td>\n",
572-
" <td>-16.60964</td>\n",
573-
" <td>-16.60964</td>\n",
574-
" <td>-16.60964</td>\n",
575-
" <td>4903.0</td>\n",
576-
" </tr>\n",
577-
" <tr>\n",
578-
" <th>2</th>\n",
579-
" <td>AAACATTGATCAGC-1</td>\n",
580-
" <td>-16.60964</td>\n",
581-
" <td>-16.60964</td>\n",
582-
" <td>-16.60964</td>\n",
583-
" <td>-16.60964</td>\n",
584-
" <td>-16.60964</td>\n",
585-
" <td>-16.60964</td>\n",
586-
" <td>-16.60964</td>\n",
587-
" <td>-16.60964</td>\n",
588-
" <td>-16.60964</td>\n",
589-
" <td>...</td>\n",
590-
" <td>-16.60964</td>\n",
591-
" <td>-16.60964</td>\n",
592-
" <td>-16.60964</td>\n",
593-
" <td>-16.60964</td>\n",
594-
" <td>-16.60964</td>\n",
595-
" <td>-16.60964</td>\n",
596-
" <td>-16.60964</td>\n",
597-
" <td>-16.60964</td>\n",
598-
" <td>-16.60964</td>\n",
599-
" <td>3149.0</td>\n",
600-
" </tr>\n",
601-
" <tr>\n",
602-
" <th>3</th>\n",
603-
" <td>AAACCGTGCTTCCG-1</td>\n",
604-
" <td>-16.60964</td>\n",
605-
" <td>-16.60964</td>\n",
606-
" <td>-16.60964</td>\n",
607-
" <td>-16.60964</td>\n",
608-
" <td>-16.60964</td>\n",
609-
" <td>-16.60964</td>\n",
610-
" <td>-16.60964</td>\n",
611-
" <td>-16.60964</td>\n",
612-
" <td>-16.60964</td>\n",
613-
" <td>...</td>\n",
614-
" <td>-16.60964</td>\n",
615-
" <td>-16.60964</td>\n",
616-
" <td>-16.60964</td>\n",
617-
" <td>-16.60964</td>\n",
618-
" <td>-16.60964</td>\n",
619-
" <td>-16.60964</td>\n",
620-
" <td>-16.60964</td>\n",
621-
" <td>-16.60964</td>\n",
622-
" <td>-16.60964</td>\n",
623-
" <td>2639.0</td>\n",
624-
" </tr>\n",
625-
" <tr>\n",
626-
" <th>4</th>\n",
627-
" <td>AAACCGTGTATGCG-1</td>\n",
628-
" <td>-16.60964</td>\n",
629-
" <td>-16.60964</td>\n",
630-
" <td>-16.60964</td>\n",
631-
" <td>-16.60964</td>\n",
632-
" <td>-16.60964</td>\n",
633-
" <td>-16.60964</td>\n",
634-
" <td>-16.60964</td>\n",
635-
" <td>-16.60964</td>\n",
636-
" <td>-16.60964</td>\n",
637-
" <td>...</td>\n",
638-
" <td>-16.60964</td>\n",
639-
" <td>-16.60964</td>\n",
640-
" <td>-16.60964</td>\n",
641-
" <td>-16.60964</td>\n",
642-
" <td>-16.60964</td>\n",
643-
" <td>-16.60964</td>\n",
644-
" <td>-16.60964</td>\n",
645-
" <td>-16.60964</td>\n",
646-
" <td>-16.60964</td>\n",
647-
" <td>981.0</td>\n",
648-
" </tr>\n",
649-
" </tbody>\n",
650-
"</table>\n",
651-
"<p>5 rows × 32740 columns</p>\n",
652-
"</div>"
653-
],
654-
"text/plain": [
655-
" cell_id MIR1302_10 FAM138A OR4F5 RP11_34P13_7 \\\n",
656-
"0 AAACATACAACCAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
657-
"1 AAACATTGAGCTAC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
658-
"2 AAACATTGATCAGC-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
659-
"3 AAACCGTGCTTCCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
660-
"4 AAACCGTGTATGCG-1 -16.60964 -16.60964 -16.60964 -16.60964 \n",
661-
"\n",
662-
" RP11_34P13_8 AL627309_1 RP11_34P13_14 RP11_34P13_9 AP006222_2 ... \\\n",
663-
"0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
664-
"1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
665-
"2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
666-
"3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
667-
"4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 ... \n",
668-
"\n",
669-
" AL590523_1 CT476828_1 PNRC2_1 SRSF10_1 AC145205_1 BAGE5 \\\n",
670-
"0 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
671-
"1 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
672-
"2 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
673-
"3 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
674-
"4 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 -16.60964 \n",
675-
"\n",
676-
" CU459201_1 AC002321_2 AC002321_1 total_counts \n",
677-
"0 -16.60964 -16.60964 -16.60964 2421.0 \n",
678-
"1 -16.60964 -16.60964 -16.60964 4903.0 \n",
679-
"2 -16.60964 -16.60964 -16.60964 3149.0 \n",
680-
"3 -16.60964 -16.60964 -16.60964 2639.0 \n",
681-
"4 -16.60964 -16.60964 -16.60964 981.0 \n",
682-
"\n",
683-
"[5 rows x 32740 columns]"
684-
]
685-
},
686-
"execution_count": 7,
687-
"metadata": {},
688-
"output_type": "execute_result"
689-
}
690-
],
691-
"source": [
692-
"#get all gene names \n",
693-
"gene_names = adata_sql.query(f\"Describe X\")['column_name'][1:].values\n",
694-
"\n",
695-
"#add a total counts column\n",
696-
"adata_sql.query(f\"ALTER TABLE X ADD COLUMN total_counts FLOAT DEFAULT 0;\")\n",
697-
"print(\"Total counts column added\")\n",
698-
"\n",
699-
"#iterates gene_names in chunks\n",
700-
"chunk_size = 990 #Ddb limited to 1k\n",
701-
"for i in range(0, len(gene_names), chunk_size):\n",
702-
"\tchunk = gene_names[i:i+chunk_size]\n",
703-
"\tchunk = \" + \".join(chunk) + \" + total_counts\"\n",
704-
"\tadata_sql.update_query(f\"UPDATE X SET total_counts = ({chunk});\", suppress_message=True)\n",
705-
"print(\"Total counts added\")\n",
706-
"\n",
707-
"#normalize to 10k and log2\n",
708-
"chunk_size = 200 #reduces db memory usage\n",
709-
"for i in range(0, len(gene_names), chunk_size):\n",
710-
"\tupdates = []\n",
711-
"\tchunk = gene_names[i:i + chunk_size]\n",
712-
"\tfor gene in chunk:\n",
713-
"\t\tupdates.append(f\"{gene} = LOG2(({gene} / total_counts) * 1e4 + 1e-5)\")\n",
714-
"\tupdate_query = f\"UPDATE X SET {', '.join(updates)}\"\n",
715-
"\tadata_sql.update_query(update_query, suppress_message=True)\n",
716-
"print(\"Normalized to 10k and log2\")\n",
717-
"\n",
718-
"#show the first 5 rows\n",
719-
"adata_sql.query(\"SELECT * FROM X LIMIT 5\")"
720-
]
721479
}
722480
],
723481
"metadata": {

src/AnnSQL.egg-info/PKG-INFO

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ adata_sql.query("SELECT corr(ITGB2,SSU72) as correlation FROM adata WHERE bulk_l
238238
<li><code>db_name</code>: Name for the database (required)</li>
239239
<li><code>db_path</code>: Path to store the database (default: 'db/')</li>
240240
<li><code>chunk_size</code>: When opening AnnData in backed mode, the number of rows to insert per chunk. Lower the value for low-memory systems (default: 5000)</li>
241+
<li><code>make_buffer_file</code>: For memory errors on very low-memory systems, set this flag to True. It mitigates the DuckDB memory leak during chunked inserts by creating a buffer file. Be sure to have hard drive space available equal to 2x the size of your AnnData object. (default: False)</li>
241242
<li><code>layers</code>: List (optional. default: ["X", "obs", "var", "var_names", "obsm", "varm", "obsp", "uns"]).<i>The layers of the Anndata object to build into the database. For larger datasets, it may be beneficial to only include the layers you're interested in querying.</i></li>
242243
<li><code>create_basic_indexes</code>: Build indexes on cell_id (optional. default: False)</li>
243244
<li><code>create_all_indexes</code>: Boolean (optional. default: False). <i>Warning: Runtime can be significant when building.</i></li>

0 commit comments

Comments
 (0)