Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit daf9cf7

Browse files
authored
Merge pull request #20 from target/dev
Merge test changes and data read post-processing into MAIN
2 parents 04f48cb + d8c9ccc commit daf9cf7

File tree

10 files changed

+178
-58
lines changed

10 files changed

+178
-58
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,28 @@ or
362362
```python
363363
df = read_csv("*.csv")
364364
```
365+
#### Post-Processing the Input Data
366+
Both `read_json()` and `read_csv()` support an optional `post_function` parameter, which allows you to specify a function to post-process the data after each individual file is read in, before it is merged into the final returned DataFrame. For example, you might want to split or combine columns, or compute a new value from existing data.
365367

366-
Consult the Pandas documentation for information on supported options for `read_csv()` and `read_json()`.
368+
Start by creating a post-processing function according to the following prototype:
369+
370+
```python
371+
def my_post_processor(df, filename):
372+
# do some stuff
373+
374+
return df
375+
```
376+
377+
When called, the `df` parameter will be a DataFrame containing the chunk of data just read, and the `filename` parameter will be the name of the file it came from, which will be different for each chunk. **IT IS IMPORTANT THAT YOU RETURN `df`, whether or not you modified the input DataFrame.**
378+
379+
Once you have defined the post-processor function, you can invoke it during your call to `read_json()` or `read_csv()` like so:
380+
381+
```python
382+
df = read_csv("*.csv", post_function=my_post_processor)
383+
```
384+
385+
#### Additional Read Options
386+
Consult the Pandas documentation for information on other supported options for `read_csv()` and `read_json()`.
367387

368388
### Normalizing nesting dicts and lists
369389

huntlib/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from huntlib.decorators import future_warning
55
import warnings
66

7-
__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance', 'flatten']
7+
__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance']
88

99
@future_warning("The huntlib.entropy() function has been moved to huntlib.util.entropy(). Please update your code. This compatibility will be removed in a future release.")
1010
def entropy(*args, **kwargs):

huntlib/data.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,37 +7,36 @@
77

88
__all__ = ['read_json', 'read_csv', 'flatten']
99

10-
def _read_multi(func=None, path_or_buf=None, *args, **kwargs):
10+
def _read_multi(read_function=None, path_or_buf=None, post_function=None, *args, **kwargs):
1111
"""
1212
Given a wildcard filename pattern (which may be just a single static
1313
filename), expand the wildcard and read all the files into a single
1414
pandas DataFrame() object.
1515
16-
:param func: Reference to the function which will read an individual data file (e.g., pd.read_csv)
16+
:param read_function: Reference to the function which will read an individual data file (e.g., pd.read_csv)
1717
:param path_or_buf: A wildcard specifying which file(s) to read
18-
:type func: A reference to a valid function which returns a pd.DataFrame() object
18+
:type read_function: A reference to a valid function which returns a pd.DataFrame() object
1919
:type path_or_buf: A `str`, `bytes` or os.PathLike object
2020
"""
2121

2222
# Make sure we have specified a read function. This should never
2323
# be called by an end user, so our code should always include one,
2424
# but you never know.
25-
if not func:
26-
raise ValueError("Must specify a read function in the `func` arg.")
25+
if not read_function:
26+
raise ValueError("Must specify a read function in the `read_function` arg.")
2727

2828
# Make sure we have a valid type of data for `path_or_buf` in glob(),
2929
# otherwise raise the same exception the original pandas function
3030
# would
3131
if not type(path_or_buf) in [str, bytes, os.PathLike]:
3232
raise ValueError(f"Invalid file path or buffer object type: {type(path_or_buf)}")
3333

34-
combined_df = pd.concat(
35-
[
36-
func(f, *args, **kwargs)
37-
for f in glob(path_or_buf)
38-
],
39-
ignore_index=True
40-
)
34+
combined_df = pd.DataFrame()
35+
for f in glob(path_or_buf):
36+
temp_df = read_function(f, *args, **kwargs)
37+
if post_function:
38+
temp_df = post_function(temp_df, f)
39+
combined_df = combined_df.append(temp_df, ignore_index=True)
4140

4241
return combined_df
4342

@@ -48,7 +47,7 @@ def read_json(path_or_buf=None, *args, **kwargs):
4847
"""
4948

5049
return _read_multi(
51-
func=pd.read_json,
50+
read_function=pd.read_json,
5251
path_or_buf=path_or_buf,
5352
*args,
5453
**kwargs
@@ -61,7 +60,7 @@ def read_csv(path_or_buf=None, *args, **kwargs):
6160
"""
6261

6362
return _read_multi(
64-
func=pd.read_csv,
63+
read_function=pd.read_csv,
6564
path_or_buf=path_or_buf,
6665
*args,
6766
**kwargs

huntlib/util.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,18 @@ def benfords(numbers):
135135
'''
136136

137137
def _first_digit(i: float):
138+
# This doesn't apply to zeros!
139+
if i == 0:
140+
return np.nan
141+
# Make negative numbers positive
142+
if i < 0:
143+
i = abs(i)
144+
# If the number is between 0 and 1, multiply by 10 until it becomes > 1
145+
# so the repeated divisions will work
146+
elif i < 1:
147+
while i < 1:
148+
i *= 10
149+
138150
while i >= 10:
139151
i //= 10
140152
return trunc(i)
@@ -157,7 +169,7 @@ def _first_digit(i: float):
157169
numbers = numbers.values
158170

159171
numbers = pd.DataFrame(numbers, columns=['numbers'])
160-
numbers['digits'] = numbers['numbers'].apply(_first_digit)
172+
numbers['digits'] = numbers['numbers'].apply(_first_digit).dropna()
161173

162174
counts = numbers['digits'].value_counts()
163175

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
long_description = fh.read()
77

88
setup(name='huntlib',
9-
version='0.5.1',
9+
version='0.5.3',
1010
description='A Python library to help with some common threat hunting data analysis operations',
1111
long_description=long_description,
1212
long_description_content_type="text/markdown",

test-infra.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
#
# Start / stop the infra needed to do tests: a Splunk Enterprise container
# and a TLS-enabled Elasticsearch container, both seeded with test data.
#
# Usage: ./test-infra.sh {start|stop}

case "$1" in
    "start")
        # Call ourselves to stop any running containers and reset the test environment.
        # "$0" is quoted so the script still works from a path containing spaces.
        "$0" stop

        echo "****** Sleeping to allow containers to stop ******"
        sleep 10

        echo "****** Creating Elastic TLS Certs ******"
        rm -rf support/certs
        mkdir support/certs
        # One-shot container that generates a PEM cert bundle for the Elastic node.
        docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v "$(pwd)"/support/certs:/certs -v "$(pwd)"/support/certificates:/usr/share/elasticsearch/config/certificates docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip ; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"

        echo "****** Starting Splunk Enterprise via Docker ******"
        docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v "$(pwd)"/support/Splunk.License:/tmp/splunk.lic -v "$(pwd)"/support/test-data.json:/tmp/test-data.json -v "$(pwd)"/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest
        echo "****** Starting Elastic via Docker ******"
        docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v "$(pwd)"/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2
        echo "****** Sleeping to allow containers to start ******"
        sleep 60

        echo "****** Loading Splunk data ******"
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata

        echo "****** Loading Elastic data ******"
        # Capture the auto-generated elastic password, then reset it to the
        # well-known test password used by the test suite.
        docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt
        echo \{\"password\": \"testpass\"\} | curl -u elastic:"$(cat /tmp/elastic_pass.txt)" --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null
        ;;
    "stop")
        echo "****** Stopping any previous Splunk container ******"
        docker kill splunk_test
        docker stop splunk_test
        docker rm splunk_test
        echo "****** Stopping any previous Elastic containers ******"
        docker kill create_elastic_certs
        docker stop create_elastic_certs
        docker rm create_elastic_certs
        docker kill elastic_test
        docker stop elastic_test
        docker rm elastic_test
        echo "****** Cleaning up artifacts ******"
        rm -rf support/certs
        rm -f /tmp/elastic_pass.txt
        ;;
    *)
        echo "Unknown command: $1"
        # POSIX exit statuses are 0-255; `exit -1` is non-portable, so use 1.
        exit 1
        ;;
esac
57+

tests/test_benfords_law.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,21 @@ def test_benfords_benfords(self):
4343
0.99,
4444
f"Chi square p-value was too low."
4545
)
46+
47+
def test_benfords_floats_and_negatives(self):
    # A mixture of ints, floats, sub-1 fractions, and negative values
    # whose leading digits should still conform to Benford's law.
    sample = [
        1, 1.0, 0.001, 1, 1, 1, 1, 1, .02, 2, 2, 2.99,
        3, 3.14159, 3, 4, 4, -5, 5, 6, 6, 7, 7, -8, 9,
    ]

    chi2, p, counts = benfords(sample)

    # Low chi-square -> observed digit counts match the expected
    # Benford distribution.
    self.assertLessEqual(
        chi2, 0.05,
        f"The chosen distribution did not conform to Benford's law, but should have. (chisquare={chi2})"
    )
    # High p-value -> no significant deviation from the expected counts.
    self.assertGreaterEqual(
        p, 0.99,
        f"Chi square p-value was too low."
    )
)

tests/test_imports.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/env python
2+
3+
# Just a set of 'tests' to make sure we can import * from
4+
# all the modules in the package. Guess why we have this??
5+
6+
from huntlib import *
7+
from huntlib.data import *
8+
from huntlib.util import *
9+
from huntlib.elastic import *
10+
from huntlib.splunk import *
11+
from huntlib.exceptions import *
12+
from huntlib.decorators import *
13+

tests/test_multi_reads.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,42 @@ def test_read_csv(self):
2222
self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
2323
self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
2424

25+
def test_read_json_post_process(self):
    # Post-processor: strip the 'ts' column when present and tag each
    # chunk with the name of the file it was read from.
    def _drop_ts_and_tag(chunk, fname):
        if 'ts' in chunk.columns:
            chunk = chunk.drop('ts', axis='columns')
        chunk['filename'] = fname
        return chunk

    df = huntlib.data.read_json(
        "support/*.json",
        lines=True,
        post_function=_drop_ts_and_tag
    )

    rows, cols = df.shape

    self.assertEqual(cols, 6, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 3000015, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 3000015, "DataFrame index values are not unique.")
    self.assertNotIn('ts', df.columns, "The 'ts' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")
45+
46+
def test_read_csv_post_process(self):
    # Post-processor: drop the 'c' column when present and record the
    # source file of each chunk.
    def _post_process(df, filename):
        if 'c' in df.columns:
            df = df.drop('c', axis='columns')
        # Fix: assign the `filename` parameter, not the literal string
        # 'filename'. The literal made the assertIn check below pass
        # without actually verifying per-file tagging, and was
        # inconsistent with test_read_json_post_process.
        df['filename'] = filename
        return df

    df = huntlib.data.read_csv(
        "support/*.csv",
        post_function=_post_process
    )
    (rows, cols) = df.shape

    self.assertEqual(cols, 3, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
    self.assertNotIn('c', df.columns, "The 'c' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")

tox.ini

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -21,50 +21,13 @@ whitelist_externals =
2121
mkdir
2222

2323
commands_pre =
24-
echo "****** Stopping any previous Splunk container ******"
25-
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
26-
echo "****** Stopping any previous Elastic containers ******"
27-
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
28-
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
29-
sleep 10
30-
31-
echo "****** Creating Elastic TLS Certs ******"
32-
rm -rf support/certs
33-
mkdir support/certs
34-
bash -c 'docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v `pwd`/support/certs:/certs -v `pwd`/support/certificates:/usr/share/elasticsearch/config/certificates docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"'
35-
36-
echo "****** Starting Splunk Enterprise via Docker ******"
37-
bash -c 'docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v `pwd`/support/Splunk.License:/tmp/splunk.lic -v `pwd`/support/test-data.json:/tmp/test-data.json -v `pwd`/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest'
38-
echo "****** Starting Elastic via Docker ******"
39-
bash -c 'docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v `pwd`/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2'
40-
echo "****** Sleeping to allow containers to start ******"
41-
sleep 60
42-
43-
echo "****** Loading Splunk data ******"
44-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass'
45-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata'
46-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main'
47-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata'
48-
49-
echo "****** Loading Elastic data ******"
50-
bash -c 'docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt'
51-
bash -c 'echo \{\"password\": \"testpass\"\} | curl -u elastic:`cat /tmp/elastic_pass.txt` --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-'
52-
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null'
53-
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null'
54-
24+
bash -c './test-infra.sh start'
5525
echo "****** Sleeping again to allow time for indexing ******"
5626
sleep 20
5727

5828
commands =
5929
python setup.py test
6030

6131
commands_post =
62-
echo "****** Stopping any previous Splunk container ******"
63-
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
64-
echo "****** Stopping any previous Elastic containers ******"
65-
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
66-
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
67-
echo "****** Cleaning up artifacts ******"
68-
rm -rf support/certs
69-
rm -f /tmp/elastic_pass.txt
70-
32+
bash -c './test-infra.sh stop'
33+

0 commit comments

Comments
 (0)