Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.

Commit daf9cf7

Browse files
authored
Merge pull request #20 from target/dev
Merge test changes and data read post-processing into MAIN
2 parents 04f48cb + d8c9ccc commit daf9cf7

File tree

10 files changed

+178
-58
lines changed

10 files changed

+178
-58
lines changed

README.md

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,28 @@ or
362362
```python
363363
df = read_csv("*.csv")
364364
```
365+
#### Post-Processing the Input Data
366+
Both `read_json()` and `read_csv()` support an optional `post_function` parameter, which allows you to specify a function to post-process the data after each individual file is read in, before it is merged into the final returned DataFrame. For example, you might want to split or combine columns, or compute a new value from existing data.
365367

366-
Consult the Pandas documentation for information on supported options for `read_csv()` and `read_json()`.
368+
Start by creating a post-processing function according to the following prototype:
369+
370+
```python
371+
def my_post_processor(df, filename):
372+
# do some stuff
373+
374+
return df
375+
```
376+
377+
When called, the `df` parameter will be a DataFrame containing the chunk of data just read, and the `filename` parameter will be the name of the file it came from, which will be different for each chunk. **IT IS IMPORTANT THAT YOU RETURN `df`, whether or not you modified the input DataFrame.**
378+
379+
Once you have defined the post-processor function, you can invoke it during your call to `read_json()` or `read_csv()` like so:
380+
381+
```python
382+
df = read_csv("*.csv", post_function=my_post_processor)
383+
```
384+
385+
#### Additional Read Options
386+
Consult the Pandas documentation for information on other supported options for `read_csv()` and `read_json()`.
367387

368388
### Normalizing nesting dicts and lists
369389

huntlib/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from huntlib.decorators import future_warning
55
import warnings
66

7-
__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance', 'flatten']
7+
__all__ = ['entropy', 'entropy_per_byte', 'promptCreds', 'edit_distance']
88

99
@future_warning("The huntlib.entropy() function has been moved to huntlib.util.entropy(). Please update your code. This compatibility will be removed in a future release.")
1010
def entropy(*args, **kwargs):

huntlib/data.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,37 +7,36 @@
77

88
__all__ = ['read_json', 'read_csv', 'flatten']
99

10-
def _read_multi(func=None, path_or_buf=None, *args, **kwargs):
10+
def _read_multi(read_function=None, path_or_buf=None, post_function=None, *args, **kwargs):
1111
"""
1212
Given a wildcard filename pattern (which may be just a single static
1313
filename), expand the wildcard and read all the files into a single
1414
pandas DataFrame() object.
1515
16-
:param func: Reference to the function which will read an individual data file (e.g., pd.read_csv)
16+
:param read_function: Reference to the function which will read an individual data file (e.g., pd.read_csv)
1717
:param path_or_buf: A wildcard specifying which file(s) to read
18-
:type func: A reference to a valid function which returns a pd.DataFrame() object
18+
:type read_function: A reference to a valid function which returns a pd.DataFrame() object
1919
:type path_or_buf: A `str`, `bytes` or os.PathLike object
2020
"""
2121

2222
# Make sure we have specified a read function. This should never
2323
# be called by an end user, so our code should always include one,
2424
# but you never know.
25-
if not func:
26-
raise ValueError("Must specify a read function in the `func` arg.")
25+
if not read_function:
26+
raise ValueError("Must specify a read function in the `read_function` arg.")
2727

2828
# Make sure we have a valid type of data for `path_or_buf` in glob(),
2929
# otherwise raise the same exception the original pandas function
3030
# would
3131
if not type(path_or_buf) in [str, bytes, os.PathLike]:
3232
raise ValueError(f"Invalid file path or buffer object type: {type(path_or_buf)}")
3333

34-
combined_df = pd.concat(
35-
[
36-
func(f, *args, **kwargs)
37-
for f in glob(path_or_buf)
38-
],
39-
ignore_index=True
40-
)
34+
combined_df = pd.DataFrame()
35+
for f in glob(path_or_buf):
36+
temp_df = read_function(f, *args, **kwargs)
37+
if post_function:
38+
temp_df = post_function(temp_df, f)
39+
combined_df = combined_df.append(temp_df, ignore_index=True)
4140

4241
return combined_df
4342

@@ -48,7 +47,7 @@ def read_json(path_or_buf=None, *args, **kwargs):
4847
"""
4948

5049
return _read_multi(
51-
func=pd.read_json,
50+
read_function=pd.read_json,
5251
path_or_buf=path_or_buf,
5352
*args,
5453
**kwargs
@@ -61,7 +60,7 @@ def read_csv(path_or_buf=None, *args, **kwargs):
6160
"""
6261

6362
return _read_multi(
64-
func=pd.read_csv,
63+
read_function=pd.read_csv,
6564
path_or_buf=path_or_buf,
6665
*args,
6766
**kwargs

huntlib/util.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,18 @@ def benfords(numbers):
135135
'''
136136

137137
def _first_digit(i: float):
138+
# This doesn't apply to zeros!
139+
if i == 0:
140+
return np.nan
141+
# Make negative numbers positive
142+
if i < 0:
143+
i = abs(i)
144+
# If the number is between 0 and 1, multiply by 10 until it becomes > 1
145+
# so the repeated divisions will work
146+
elif i < 1:
147+
while i < 1:
148+
i *= 10
149+
138150
while i >= 10:
139151
i //= 10
140152
return trunc(i)
@@ -157,7 +169,7 @@ def _first_digit(i: float):
157169
numbers = numbers.values
158170

159171
numbers = pd.DataFrame(numbers, columns=['numbers'])
160-
numbers['digits'] = numbers['numbers'].apply(_first_digit)
172+
numbers['digits'] = numbers['numbers'].apply(_first_digit).dropna()
161173

162174
counts = numbers['digits'].value_counts()
163175

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
long_description = fh.read()
77

88
setup(name='huntlib',
9-
version='0.5.1',
9+
version='0.5.3',
1010
description='A Python library to help with some common threat hunting data analysis operations',
1111
long_description=long_description,
1212
long_description_content_type="text/markdown",

test-infra.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
#
# Start / stop the infra needed to do tests: a Splunk Enterprise container
# and a TLS-enabled Elasticsearch container, both seeded with test data.
#
# Usage: ./test-infra.sh {start|stop}

case "$1" in
    "start")
        # Call ourselves to stop any running containers and reset the test environment.
        # "$0" is quoted so the script still works from a path containing spaces.
        "$0" stop

        echo "****** Sleeping to allow containers to stop ******"
        sleep 10

        echo "****** Creating Elastic TLS Certs ******"
        rm -rf support/certs
        mkdir support/certs
        # One-shot container that generates a PEM cert bundle for the Elastic node.
        docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v "$(pwd)"/support/certs:/certs -v "$(pwd)"/support/certificates:/usr/share/elasticsearch/config/certificates docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip ; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"

        echo "****** Starting Splunk Enterprise via Docker ******"
        docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v "$(pwd)"/support/Splunk.License:/tmp/splunk.lic -v "$(pwd)"/support/test-data.json:/tmp/test-data.json -v "$(pwd)"/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest
        echo "****** Starting Elastic via Docker ******"
        docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v "$(pwd)"/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2
        echo "****** Sleeping to allow containers to start ******"
        sleep 60

        echo "****** Loading Splunk data ******"
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main
        docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata

        echo "****** Loading Elastic data ******"
        # Capture the auto-generated elastic password, then reset it to the
        # well-known test password used by the test suite.
        docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt
        echo \{\"password\": \"testpass\"\} | curl -u elastic:"$(cat /tmp/elastic_pass.txt)" --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null
        curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null
        ;;
    "stop")
        echo "****** Stopping any previous Splunk container ******"
        docker kill splunk_test
        docker stop splunk_test
        docker rm splunk_test
        echo "****** Stopping any previous Elastic containers ******"
        docker kill create_elastic_certs
        docker stop create_elastic_certs
        docker rm create_elastic_certs
        docker kill elastic_test
        docker stop elastic_test
        docker rm elastic_test
        echo "****** Cleaning up artifacts ******"
        rm -rf support/certs
        rm -f /tmp/elastic_pass.txt
        ;;
    *)
        echo "Unknown command: $1"
        # POSIX exit statuses are 0-255; `exit -1` is non-portable, so use 1.
        exit 1
        ;;
esac
57+

tests/test_benfords_law.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,21 @@ def test_benfords_benfords(self):
4343
0.99,
4444
f"Chi square p-value was too low."
4545
)
46+
47+
def test_benfords_floats_and_negatives(self):
    # A mixture of ints, floats, sub-1 fractions, and negative values
    # whose leading digits should still conform to Benford's law.
    sample = [
        1, 1.0, 0.001, 1, 1, 1, 1, 1, .02, 2, 2, 2.99,
        3, 3.14159, 3, 4, 4, -5, 5, 6, 6, 7, 7, -8, 9,
    ]

    chi2, p, counts = benfords(sample)

    # Low chi-square -> observed digit counts match the expected
    # Benford distribution.
    self.assertLessEqual(
        chi2, 0.05,
        f"The chosen distribution did not conform to Benford's law, but should have. (chisquare={chi2})"
    )
    # High p-value -> no significant deviation from the expected counts.
    self.assertGreaterEqual(
        p, 0.99,
        f"Chi square p-value was too low."
    )
)

tests/test_imports.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/env python
2+
3+
# Just a set of 'tests' to make sure we can import * from
4+
# all the modules in the package. Guess why we have this??
5+
6+
from huntlib import *
7+
from huntlib.data import *
8+
from huntlib.util import *
9+
from huntlib.elastic import *
10+
from huntlib.splunk import *
11+
from huntlib.exceptions import *
12+
from huntlib.decorators import *
13+

tests/test_multi_reads.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,42 @@ def test_read_csv(self):
2222
self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
2323
self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
2424

25+
def test_read_json_post_process(self):
    # Post-processor: strip the 'ts' column when present and tag each
    # chunk with the name of the file it was read from.
    def _drop_ts_and_tag(chunk, fname):
        if 'ts' in chunk.columns:
            chunk = chunk.drop('ts', axis='columns')
        chunk['filename'] = fname
        return chunk

    df = huntlib.data.read_json(
        "support/*.json",
        lines=True,
        post_function=_drop_ts_and_tag
    )

    rows, cols = df.shape

    self.assertEqual(cols, 6, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 3000015, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 3000015, "DataFrame index values are not unique.")
    self.assertNotIn('ts', df.columns, "The 'ts' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")
45+
46+
def test_read_csv_post_process(self):
    # Post-processor: drop the 'c' column when present and record the
    # source file of each chunk.
    def _post_process(df, filename):
        if 'c' in df.columns:
            df = df.drop('c', axis='columns')
        # Fix: assign the `filename` parameter, not the literal string
        # 'filename'. The literal made the assertIn check below pass
        # without actually verifying per-file tagging, and was
        # inconsistent with test_read_json_post_process.
        df['filename'] = filename
        return df

    df = huntlib.data.read_csv(
        "support/*.csv",
        post_function=_post_process
    )
    (rows, cols) = df.shape

    self.assertEqual(cols, 3, "The resulting DataFrame had the wrong number of columns.")
    self.assertEqual(rows, 6, "The resulting DataFrame had the wrong number of rows.")
    self.assertEqual(df.index.nunique(), 6, "DataFrame index values are not unique.")
    self.assertNotIn('c', df.columns, "The 'c' field was present, but should have been dropped in post processing.")
    self.assertIn('filename', df.columns, "The 'filename' field should have been created in post processing, but was not present.")

tox.ini

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -21,50 +21,13 @@ whitelist_externals =
2121
mkdir
2222

2323
commands_pre =
24-
echo "****** Stopping any previous Splunk container ******"
25-
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
26-
echo "****** Stopping any previous Elastic containers ******"
27-
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
28-
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
29-
sleep 10
30-
31-
echo "****** Creating Elastic TLS Certs ******"
32-
rm -rf support/certs
33-
mkdir support/certs
34-
bash -c 'docker run -it --name create_elastic_certs -e CERTS_DIR=/usr/share/elasticsearch/config/certificates -v `pwd`/support/certs:/certs -v `pwd`/support/certificates:/usr/share/elasticsearch/config/certificates docker.elastic.co/elasticsearch/elasticsearch:7.6.2 bash -c "yum install -y -q -e 0 unzip; ls -la /certs ; ls -la /usr/share/elasticsearch/config/certificates ;if [[ ! -f /certs/bundle.zip ]]; then bin/elasticsearch-certutil cert --silent --pem --in config/certificates/instances.yml -out /certs/bundle.zip; unzip /certs/bundle.zip -d /certs; fi; chown -R 1000:0 /certs"'
35-
36-
echo "****** Starting Splunk Enterprise via Docker ******"
37-
bash -c 'docker run -it -d --name splunk_test -e SPLUNK_START_ARGS=--accept-license -e SPLUNK_LICENSE_URI=/tmp/splunk.lic -e SPLUNK_PASSWORD=testpass -p 8000:8000 -p 8089:8089 -v `pwd`/support/Splunk.License:/tmp/splunk.lic -v `pwd`/support/test-data.json:/tmp/test-data.json -v `pwd`/support/test-data-large.json:/tmp/test-data-large.json splunk/splunk:latest'
38-
echo "****** Starting Elastic via Docker ******"
39-
bash -c 'docker run -d -it --name elastic_test -e node.name=es01 -e cluster.initial_master_nodes=es01 -e xpack.license.self_generated.type=trial -e xpack.security.enabled=true -e xpack.security.http.ssl.enabled=true -e xpack.security.http.ssl.key=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.key -e xpack.security.http.ssl.certificate_authorities=/usr/share/elasticsearch/config/certificates/ca/ca.crt -e xpack.security.http.ssl.certificate=/usr/share/elasticsearch/config/certificates/elastic_test/elastic_test.crt -v `pwd`/support/certs:/usr/share/elasticsearch/config/certificates -p 9200:9200 docker.elastic.co/elasticsearch/elasticsearch:7.6.2'
40-
echo "****** Sleeping to allow containers to start ******"
41-
sleep 60
42-
43-
echo "****** Loading Splunk data ******"
44-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk list user -auth admin:testpass'
45-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add index bigdata'
46-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data.json -index main'
47-
bash -c 'docker exec -it splunk_test sudo /opt/splunk/bin/splunk add oneshot /tmp/test-data-large.json -index bigdata'
48-
49-
echo "****** Loading Elastic data ******"
50-
bash -c 'docker exec elastic_test bin/elasticsearch-setup-passwords auto --batch --url https://localhost:9200 | grep "PASSWORD elastic" | cut -d" " -f 4 > /tmp/elastic_pass.txt'
51-
bash -c 'echo \{\"password\": \"testpass\"\} | curl -u elastic:`cat /tmp/elastic_pass.txt` --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST https://localhost:9200/_security/user/elastic/_password --data-binary @-'
52-
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-elastic.json > /dev/null'
53-
bash -c 'curl -u elastic:testpass --cacert support/certs/ca/ca.crt -H "Content-Type: application/json" -XPOST "https://localhost:9200/_bulk" --data-binary @support/test-data-large-elastic.json > /dev/null'
54-
24+
bash -c './test-infra.sh start'
5525
echo "****** Sleeping again to allow time for indexing ******"
5626
sleep 20
5727

5828
commands =
5929
python setup.py test
6030

6131
commands_post =
62-
echo "****** Stopping any previous Splunk container ******"
63-
bash -c 'docker kill splunk_test ; docker stop splunk_test ; docker rm splunk_test ; true'
64-
echo "****** Stopping any previous Elastic containers ******"
65-
bash -c 'docker kill create_elastic_certs; docker stop create_elastic_certs; docker rm create_elastic_certs; true'
66-
bash -c 'docker kill elastic_test; docker stop elastic_test; docker rm elastic_test; true'
67-
echo "****** Cleaning up artifacts ******"
68-
rm -rf support/certs
69-
rm -f /tmp/elastic_pass.txt
70-
32+
bash -c './test-infra.sh stop'
33+

0 commit comments

Comments
 (0)