Skip to content

Commit f3c2012

Browse files
authored
Merge pull request #462 from stefanseefeld/dask
Use dask to parallelize chunk computation as much as possible.
2 parents 82c9de7 + 14110a0 commit f3c2012

File tree

4 files changed

+78
-22
lines changed

4 files changed

+78
-22
lines changed

docs/source/whatsnew/0.6.0.txt

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
Release |version|
2+
-----------------
3+
4+
:Release: |version|
5+
:Date: TBD
6+
7+
New Features
8+
------------
9+
10+
None
11+
12+
Experimental Features
13+
---------------------
14+
15+
.. warning::
16+
17+
Experimental features are subject to change.
18+
19+
None
20+
21+
New Backends
22+
------------
23+
24+
None
25+
26+
Improved Backends
27+
-----------------
28+
29+
* Optimized `chunks` backend to allow parallel evaluation using `dask`
30+
whenever the iterable is a list of callables.
31+
32+
API Changes
33+
-----------
34+
35+
None
36+
37+
Bug Fixes
38+
---------
39+
40+
None
41+
42+
Miscellaneous
43+
-------------
44+
45+
None

odo/backends/csv.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from ..temp import Temp
3131
from ..numpy_dtype import dshape_to_pandas
3232
from .pandas import coerce_datetimes
33+
from functools import partial
3334

3435
dialect_terms = '''delimiter doublequote escapechar lineterminator quotechar
3536
quoting skipinitialspace strict'''.split()
@@ -321,11 +322,8 @@ def CSV_to_chunks_of_dataframes(c, chunksize=2 ** 20, **kwargs):
321322
else:
322323
rest = []
323324

324-
def _():
325-
yield first
326-
for df in rest:
327-
yield df
328-
return chunks(pd.DataFrame)(_)
325+
data = [first] + rest
326+
return chunks(pd.DataFrame)(data)
329327

330328

331329
@discover.register(CSV)
@@ -368,10 +366,8 @@ def resource_glob(uri, **kwargs):
368366
@convert.register(chunks(pd.DataFrame), (chunks(CSV), chunks(Temp(CSV))),
                  cost=10.0)
def convert_glob_of_csvs_to_chunks_of_dataframes(csvs, **kwargs):
    """Convert a collection of CSVs into chunked DataFrames.

    Rather than converting each CSV eagerly, wrap every conversion as a
    deferred ``functools.partial`` so that ``Chunks.__iter__`` can
    evaluate the list of callables in parallel when the chunks are
    actually consumed.
    """
    deferred = []
    for csv in csvs:
        deferred.append(partial(convert, chunks(pd.DataFrame), csv, **kwargs))
    return chunks(pd.DataFrame)(deferred)
375371

376372

377373
@convert.register(Temp(CSV), (pd.DataFrame, chunks(pd.DataFrame)))

odo/chunks.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from toolz import memoize, first, peek
66
from datashape import discover, var
77
from .utils import cls_name, copydoc
8+
from dask.threaded import get as dsk_get
89

910

1011
class Chunks(object):
@@ -34,8 +35,18 @@ def __init__(self, data):
3435
def __iter__(self):
    """Iterate over the underlying chunks.

    Three cases, matching how ``self.data`` may be supplied:

    * a callable: call it and return the resulting iterator;
    * a non-empty list whose first element is callable: treat the list
      as deferred chunk producers, evaluate them all at once with the
      dask threaded scheduler, cache the materialized results back onto
      ``self.data``, and iterate over those;
    * anything else: iterate ``self.data`` directly.
    """
    if callable(self.data):
        return self.data()
    data = self.data
    if isinstance(data, list) and data and callable(data[0]):
        # Build a trivial dask graph with one zero-argument task per
        # callable, then evaluate the whole batch in one get() call.
        # Replacing self.data with the results means later iterations
        # reuse them instead of recomputing.
        keys = ['p%d' % idx for idx in range(len(data))]
        graph = dict((key, (func,)) for key, func in zip(keys, data))
        self.data = dsk_get(graph, keys)
    return iter(self.data)
3950

4051

4152
@memoize

odo/convert.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .chunks import chunks, Chunks
1212
from .numpy_dtype import dshape_to_numpy
1313
from .utils import records_to_tuples
14+
from functools import partial
1415

1516

1617
convert = NetworkDispatcher('convert')
@@ -207,34 +208,37 @@ def _():
207208
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    """Convert an iterator of records into ``chunks(pd.DataFrame)``.

    Parameters
    ----------
    seq : iterable
        The records to convert, partitioned into groups of ``chunksize``
        elements.
    chunksize : int, optional
        Number of records per DataFrame chunk (default 1024).
    **kwargs
        Forwarded to ``convert``.  ``add_index=True`` selects the slow,
        sequential path that threads a running integer index across the
        produced frames.

    Returns
    -------
    chunks(pd.DataFrame)
    """
    seq2 = partition_all(chunksize, seq)

    if not kwargs.get('add_index', False):
        # Simple case: each partition converts independently, so hand a
        # list of deferred callables to chunks() and let Chunks.__iter__
        # evaluate them in parallel via dask.  Using partial(convert, ...)
        # directly (instead of wrapping a lambda) matches the deferred
        # style of convert_glob_of_csvs_to_chunks_of_dataframes.
        data = [partial(convert, pd.DataFrame, d, **kwargs) for d in seq2]
        if not data:
            # Empty input: still yield one (empty) DataFrame so callers
            # see a well-formed chunk sequence.
            data = [convert(pd.DataFrame, [], **kwargs)]
        return chunks(pd.DataFrame)(data)

    # TODO: Decide whether we should support the `add_index` flag at all.
    # If so, we need to post-process the converted DataFrame objects
    # sequentially, so we can't parallelize the process.
    try:
        first, rest = next(seq2), seq2
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        df = convert(pd.DataFrame, first, **kwargs)
        df1, n1 = _add_index(df, 0)

        def _():
            # Thread the running index offset n through every chunk so
            # the concatenated frames carry a contiguous integer index.
            n = n1
            yield df1
            for i in rest:
                df = convert(pd.DataFrame, i, **kwargs)
                df, n = _add_index(df, n)
                yield df
    return chunks(pd.DataFrame)(_)
232240

233241

234-
def _ignore_index(df, start):
235-
return df, start
236-
237-
238242
def _add_index(df, start, _idx_type=getattr(pd, 'RangeIndex',
239243
compose(pd.Index, np.arange))):
240244
stop = start + len(df)

0 commit comments

Comments
 (0)