research-visualizer/survey.py at master · yegnau/research-visualizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# encoding: utf-8
import pandas as pd
import numpy as np

class Survey:
    default_empty_code = 6101
    default_empty_label = '(Not available)'

    def __init__(self, data_values, data_labels, data_variables):
        """
        Initialization

        Parameters
        ----------
        data_values : pandas.core.frame.DataFrame
            Pandas DataFrame with numerical survey responses
            Loaded from CSV that is created by running the following command in PSPP:
                SAVE TRANSLATE /OUTFILE="data_values.csv" /TYPE=CSV /FIELDNAMES.
        data_labels: pandas.core.frame.DataFrame
            Pandas DataFrame with text survey responses
            Loaded from CSV that is created by running the following command in PSPP:
                SAVE TRANSLATE /OUTFILE="data_labels.csv" /TYPE=CSV /FIELDNAMES /CELLS=LABELS.
        data_variables: str
            String with survey questions and labels
            Created by running the following command in PSPP:
                DISPLAY LABELS.
        """
        self.data_values = self.process_values(data_values)
        self.data_labels = self.process_labels(data_labels)
        self.data_variables = data_variables # Used to create child objects (survey subsets)
        self.variable_label_mapping = self.parse_variable_label_mapping(data_variables)
        self.add_weights_column()
        self.total_weight = self.calculate_total_weight() # Total weight is opdated automatically when different weights are set
        self.value_label_mapping = self.calculate_value_label_mapping()
        assert len(data_values) == len(data_labels), "Something went wrong, values and labels are not equal"
        self.num_rows, self.num_cols = self.data_values.shape

    def process_values(self, data_values, empty_code = default_empty_code):
        """
        Cleans up the DataFrame with values and makes it numerical only

        Parameters
        ----------
        data_values : pandas.core.frame.DataFrame
            Pandas DataFrame with numerical survey responses
        """
        data_values = data_values.replace(' ', np.nan)
        data_values = data_values.dropna(how='all') # When PSPP exports to CSV, it may sometimes create empty rows
        data_values = data_values.replace(np.nan, empty_code)
        data_values = self.convert_to_numbers(data_values, empty_code=empty_code)
        return data_values

    def convert_to_numbers(self, data_values, empty_code = default_empty_code, dtype = 'int32'):
        """
        Makes the DataFrame numerical only

        Parameters
        ----------
        data_values : pandas.core.frame.DataFrame
            Pandas DataFrame with numerical survey responses
        """
        num_cols = data_values.shape[1]
        for i in range(0, num_cols):
            try:
                column = data_values.iloc[:, i].astype(dtype)
            except ValueError:
                # Column contains strings, making it one-hot-encoded. E.g. "Answer1; Answer2; Answer3; Answer1; Answer4" becomes "1; 2; 3; 1; 4"
                column = self.build_one_hot_column(data_values.iloc[:, i], empty_code=empty_code)
                column = column.astype(dtype)
            data_values.iloc[:, i] = column.values
        return data_values

    def build_one_hot_column(self, column, empty_code = default_empty_code):
        """
        Building a one-hot-encoded column from a string column

        Parameters
        ----------
        column : pandas.core.frame.DataFrame
            Column of Pandas DataFrames
        """
        unique_values = np.unique(column)
        for i, value in enumerate(unique_values):
            if value != empty_code:
                column = column.replace(value, i) # Replacing each unique value with its index
        return column

    def process_labels(self, data_labels, empty_code = default_empty_label):
        """
        Cleans up the DataFrame with labels

        Parameters
        ----------
        data_labels : pandas.core.frame.DataFrame
            Pandas DataFrame with text survey responses
        """
        data_labels = data_labels.replace(' ', np.nan)
        data_labels = data_labels.dropna(how='all') # When PSPP exports to CSV, it may sometimes create empty rows
        data_labels = data_labels.replace(np.nan, empty_code)
        return data_labels

    def parse_variable_label_mapping(self, data_variables):
        """
        Builds a dictionary of variable labels

        Parameters
        ----------
        data_variables : str
            String with survey questions and labels (SPSS/PSPP output)
            Example:
                Variable            Label                                              Position
                ═══════════════════════════════════════════════════════════════════════════════
                                 S5 What field do you work in?                                7
                                 S6 Which of the following better describes your              8
                                    current employment status?
                                 S7 You mentioned you’re currently a full-time student        9
                                    In which of the following levels of school are you
                                    currently enrolled?
            The goal is to extract combinations like {S5 : "What field do you work in"?} and return the resulting dictionary
        """
        coding = {}
        code, description = 'foo', 'bar'
        for line in data_variables.split("\n"):
            line = line.strip()
            if line[0:8] == 'Variable' or '═' in line or line == '':
                # Skipping lines than don't belong to labels
                continue
            if line[-8:].strip().isdigit():
                # If the last 8 symbols of the line are digits (and spaces), then it must be the beginning of a new variable
                code = line[0:line.find(' ')]
                description = line[line.find(' '):-8].strip()
                coding[code] = description
            else:
                # If the line doesn't end with a digit, then it needs to be attached to description
                description += ' ' + line
                coding[code] = description
        return coding

    def drop_rows(self, exclude_values):
        """
        Searches a given column for given values, and removes the respective row if one of the values is found

        Parameters
        ----------
        exclude_values : list
            List of values for which the row needs to be removed
        column_index : int
            Index of the column where to search
        """
        if len(exclude_values) == 0:
            return True
        num_rows = self.data_values.shape[0]
        row_indices = [] # Indices of rows that will be removed from self.data_values and self.data_labels
        column_names = exclude_values.keys()
        for column_name in column_names:
            for row in range(num_rows):
                if self.data_values[column_name].values[row] in exclude_values[column_name]:
                    row_indices.append(row)
        self.data_values.drop(row_indices, axis=0, inplace=True)
        self.data_labels.drop(row_indices, axis=0, inplace=True)
        # Reindexing
        self.data_values.index = range(len(self.data_values))
        self.data_labels.index = range(len(self.data_labels))
        return True

    def add_weights_column(self):
        """
        Adds a new "Weights" column filled with ones
        """
        num_rows = self.data_values.shape[0]
        new_column = np.ones(num_rows)
        self.data_values['Weights'] = new_column
        self.data_labels['Weights'] = new_column
        self.variable_label_mapping['Weights'] = "Weights"
        return True

    def calculate_total_weight(self):
        """
        Sums weights from the "Weights" column
        """
        total_weight = self.data_values['Weights'].sum()
        return total_weight

    def set_weights(self, weights):
        """
        Goes through values of a given column, and updates the column "Weights" with weights that each row must have

        Parameters
        ----------
        weights : dict
            Dictionary formatted as {column_name: {value1: weight1, ..., valueN: weightN}}
        """
        if len(weights) == 0:
            return True
        column_name = weights.keys()[0]
        column = self.data_values[column_name]
        # Weights are added so that total weight doesn't change
        total_weight = sum(weights[column_name].values())
        new_column = column.copy()
        unique_values = np.unique(column)
        for value in unique_values:
            value = str(value) # Valid JSON allows string keys only
            new_value = 1.0 * len(unique_values) * weights[column_name][value] / total_weight
            new_column = new_column.replace(value, new_value)
        self.data_values['Weights'] = new_column.values
        self.data_labels['Weights'] = new_column.values
        self.total_weight = self.calculate_total_weight()
        return True

    def calculate_value_label_mapping(self):
        """
        Calculates mapping from values in self.data_values to self.data_labels

        Returns a dictionary formatted as {column_name: {value1: label1, ..., valueN: labelN}}
        """
        value_label_mapping = {}
        column_names = self.data_values.columns
        for column_name in column_names:
            values, value_indices = np.unique(self.data_values[column_name], return_index = True) # E.g. ([1, 2, 3, 4, 5], [ 4, 36, 10,  0, 60])
            labels, label_indices  = np.unique(self.data_labels[column_name], return_index = True) # E.g. (['red', 'orange', 'yellow', 'green', 'blue'], [ 4, 10, 60,  0, 36])
            # Sorting by indices, so that unique values and labels appear in the same order
            values_sorted = [value[1] for value in sorted(zip(value_indices, values))]
            labels_sorted = [label[1] for label in sorted(zip(label_indices, labels))]
            column_mapping = {}
            for index, value in enumerate(values_sorted):
                column_mapping[value] = labels_sorted[index]
            value_label_mapping[column_name] = column_mapping
        return value_label_mapping

    def get_column_value_counts(self, column_name, use_weights = True):
        """
        Calculates the number of different answers in a given column

        Returns a list of tuples: [(answer1, count1), ..., (answerN, countN)]

        Parameters
        ----------
        column_name : str
            Name of the column to be analyzed
        """
        column_value_counts = []
        for value in self.value_label_mapping[column_name].keys(): # Using self.value_label_mapping because it's sorted
            value_vector = (self.data_values[column_name] == value)
            if use_weights:
                weighted_count = self.data_values['Weights'][value_vector].sum()
                weighted_count = int(round(weighted_count))
                column_value_counts.append((value, weighted_count))
            else:
                count = value_vector.sum()
                column_value_counts.append((value, count))
        column_value_counts.sort()
        return column_value_counts

    def get_subset_survey(self, filter):
        """
        Filters the survey using a given rule and creates another Survey object

        Parameters
        ----------
        filter : dict
            Dictionary formatted as {column_name: value}
        """
        for variable, value in filter.iteritems():
            rule = (self.data_values[variable] == value)
            subset_data_values = self.data_values[rule]
            subset_data_labels = self.data_labels[rule]
        return Survey(subset_data_values, subset_data_labels, self.data_variables)