-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharticle_getter.py
More file actions
308 lines (237 loc) · 9.45 KB
/
article_getter.py
File metadata and controls
308 lines (237 loc) · 9.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""Logic for querying for individual article's data as opposed to aggregated stats.
License: BSD
"""
import codecs
import csv
import io
import os
import typing
# boto3 is an optional dependency: it is only needed when reading from S3.
# Record whether it could be imported so that callers can fail with a clear
# message instead of a NameError.
try:
    import boto3  # type: ignore
    boto_available = True
except ImportError:
    # Only a missing / unimportable package is expected here; anything else
    # (e.g. KeyboardInterrupt) should propagate rather than be swallowed.
    boto_available = False
# Columns, in order, written to the CSV export.
COLS = ('url', 'published', 'country', 'tags')

# S3 bucket and object key from which the article listing is read.
OBJ_BUCKET = 'gafj-topic-explorer'
OBJ_PATH = 'articles.csv'
class Article:
    """Immutable-by-convention record holding the metadata of one article."""

    def __init__(self, url: str, title_original: str, title_english: str, published: str,
                 country: str, keywords: typing.List[str], tags: typing.List[str],
                 categories: typing.List[str]):
        """Build a record from already-parsed field values.

        Args:
            url: Location at which the full article can be read.
            title_original: Title as published, prior to machine translation.
            title_english: Title following machine translation.
            published: ISO8601 string giving the publication date / time.
            country: Human-readable name of the country of publication.
            keywords: Keywords found in the article.
            tags: Tags assigned to the article.
            categories: Categories to which the article belongs.
        """
        # Descriptive text fields.
        self._url = url
        self._title_original = title_original
        self._title_english = title_english
        self._published = published
        self._country = country

        # List-valued fields.
        self._keywords = keywords
        self._tags = tags
        self._categories = categories

    def get_url(self) -> str:
        """Get the location where the full article can be found.

        Returns:
            URL of the full article.
        """
        return self._url

    def get_title_original(self) -> str:
        """Get the article's title as originally published.

        Returns:
            Title prior to machine translation.
        """
        return self._title_original

    def get_title_english(self) -> str:
        """Get the article's title in English.

        Returns:
            Title following machine translation.
        """
        return self._title_english

    def get_published(self) -> str:
        """Get when this article was published.

        Returns:
            ISO8601 string giving the publication date / time.
        """
        return self._published

    def get_country(self) -> str:
        """Get where this article was published.

        Returns:
            Human-readable name of the country of publication.
        """
        return self._country

    def get_keywords(self) -> typing.List[str]:
        """Get the keywords associated with this article.

        Returns:
            Keywords found in the article.
        """
        return self._keywords

    def get_tags(self) -> typing.List[str]:
        """Get the tags associated with this article.

        Returns:
            Tags assigned to the article.
        """
        return self._tags

    def get_categories(self) -> typing.List[str]:
        """Get the categories associated with this article.

        Returns:
            Categories to which the article belongs.
        """
        return self._categories

    def to_dict(self) -> typing.Dict[str, str]:
        """Flatten this record into a JSON serializable dictionary.

        List-valued fields are joined with semicolons in their stored order;
        the 'tags' entry is additionally provided sorted and space-separated.

        Returns:
            Dictionary containing only string values.
        """
        tags = self.get_tags()
        sorted_tags_str = ' '.join(sorted(tags))

        flattened = {}
        flattened['url'] = self.get_url()
        flattened['published'] = self.get_published()
        flattened['country'] = self.get_country()
        flattened['tags'] = sorted_tags_str
        flattened['keywordList'] = ';'.join(self.get_keywords())
        flattened['tagList'] = ';'.join(tags)
        flattened['categoryList'] = ';'.join(self.get_categories())
        return flattened
class ArticleGetter:
    """Abstract base class for a strategy to query and filter articles.

    Implementors provide three hooks: how to extract query parameters from the
    caller's input (_get_query_params), where the raw tab-separated lines come
    from (_get_source) and how matching articles are packaged for the caller
    (_make_response).
    """

    def execute_to_obj(self, params: typing.Dict) -> typing.Iterable[Article]:
        """Execute a query and return article objects.

        Args:
            params: Dictionary describing the query.
        Returns:
            Matching Article objects.
        """
        query_params = self._get_query_params(params)
        input_lines = self._get_source()
        return self._execute_query(query_params, input_lines)

    def execute_to_native(self, params: typing.Dict):
        """Execute a query and return a "native" format depending on getter.

        Args:
            params: Dictionary describing the query.
        Returns:
            Native version of the matching Article objects.
        """
        return self._make_response(self.execute_to_obj(params))

    def _parse_row(self, target_str: str) -> typing.Optional[Article]:
        """Parse one tab-separated line into an Article.

        Args:
            target_str: Raw line, optionally still carrying its line terminator.
        Returns:
            Parsed Article or None if the line does not have exactly 8 fields.
        """
        # Lines coming from readlines() / stream iteration keep their line
        # terminator. Without removing it the final category of every row
        # would end in "\n" and never match a category filter. Only the
        # terminator is stripped so that trailing empty fields are preserved.
        pieces = target_str.rstrip('\r\n').split('\t')
        if len(pieces) != 8:
            return None

        (url, title_original, title_english, published, country,
         keywords_str, tags_str, categories_str) = pieces

        return Article(
            url,
            title_original,
            title_english,
            published,
            country,
            keywords_str.split(';'),
            tags_str.split(';'),
            categories_str.split(';')
        )

    def _execute_query(self, query_params: typing.Dict[str, str],
                       input_lines: typing.Iterable[str]) -> typing.Iterable[Article]:
        """Lazily filter input lines down to the articles matching a query.

        Recognized query keys are 'keyword', 'tag', 'category' (membership in
        the corresponding list) and 'country' (exact match). Unparseable lines
        and the header row (whose url field is literally 'url') are dropped.

        Args:
            query_params: Mapping with zero or more of the recognized keys. A
                missing mapping (None) is treated as no filters.
            input_lines: Raw tab-separated lines.
        Returns:
            Lazy iterable over the matching Article objects.
        """
        params = query_params or {}  # Tolerate a missing parameter mapping.

        # Each requested filter becomes one predicate over an Article.
        predicates: typing.List[typing.Callable[[Article], bool]] = []

        if 'keyword' in params:
            target_keyword = params['keyword']
            predicates.append(lambda article: target_keyword in article.get_keywords())

        if 'tag' in params:
            target_tag = params['tag']
            predicates.append(lambda article: target_tag in article.get_tags())

        if 'category' in params:
            target_category = params['category']
            predicates.append(lambda article: target_category in article.get_categories())

        if 'country' in params:
            target_country = params['country']
            predicates.append(lambda article: article.get_country() == target_country)

        parsed = (self._parse_row(line) for line in input_lines)

        return (
            article for article in parsed
            if article is not None
            and article.get_url() != 'url'  # Skip the header row.
            and all(matches(article) for matches in predicates)
        )

    def _get_query_params(self, target: typing.Dict) -> typing.Dict:
        """Extract the query parameters from the caller's input (abstract).

        Raises:
            NotImplementedError: Always; subclasses must override. This is a
                RuntimeError subclass so existing handlers keep working.
        """
        raise NotImplementedError('Use implementor.')

    def _get_source(self) -> typing.Iterable[str]:
        """Get the raw tab-separated lines to be queried (abstract).

        Raises:
            NotImplementedError: Always; subclasses must override. This is a
                RuntimeError subclass so existing handlers keep working.
        """
        raise NotImplementedError('Use implementor.')

    def _make_response(self, matching: typing.Iterable[Article]) -> typing.Dict:
        """Package matching articles for the caller (abstract).

        Raises:
            NotImplementedError: Always; subclasses must override. This is a
                RuntimeError subclass so existing handlers keep working.
        """
        raise NotImplementedError('Use implementor.')
class AwsLambdaArticleGetter(ArticleGetter):
    """Getter which queries for data from S3 and returns a Lambda HTTP response."""

    def _get_query_params(self, target: typing.Dict) -> typing.Dict:
        """Extract the query string parameters from a Lambda event.

        Args:
            target: The Lambda event.
        Returns:
            The event's query string parameters or an empty dictionary if the
            request carried none.
        """
        # API Gateway proxy events carry a null queryStringParameters value
        # when the request has no query string; normalize that (and a missing
        # key) to "no filters" instead of failing later on `in None`.
        return target.get('queryStringParameters') or {}

    def _get_source(self) -> typing.Iterable[str]:
        """Open the article listing in S3 as a UTF-8 text stream.

        Returns:
            Stream reader over the object at OBJ_BUCKET / OBJ_PATH.
        Raises:
            RuntimeError: If boto3 could not be imported.
        """
        if not boto_available:
            raise RuntimeError('Please install boto before lambda handler use.')

        client = boto3.client('s3')
        obj = client.get_object(Bucket=OBJ_BUCKET, Key=OBJ_PATH)
        body = obj['Body']

        # Wrap the byte stream so that it is decoded incrementally.
        stream_reader = codecs.getreader('utf-8')
        return stream_reader(body)

    def _make_response(self, matching: typing.Iterable[Article]):
        """Build the Lambda HTTP response offering the matches as a CSV download.

        Args:
            matching: The articles to include in the export.
        Returns:
            Dictionary with statusCode, headers and the CSV text as body.
        """
        csv_str = self._make_csv_str(matching)
        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'text/csv',
                # The suggested file name is a parameter of Content-Disposition,
                # not a header of its own.
                'Content-Disposition': 'attachment; filename="articles_export.csv"',
                # Retained only for backward compatibility with any client that
                # read this non-standard header.
                'filename': 'articles_export.csv',
                'Access-Control-Allow-Origin': '*'
            },
            'body': csv_str
        }

    def _make_csv_str(self, articles: typing.Iterable[Article]) -> str:
        """Serialize articles to CSV text with a header row.

        Only the columns listed in COLS are written; other to_dict entries are
        ignored.

        Args:
            articles: The articles to serialize.
        Returns:
            CSV document as a string.
        """
        output_target = io.StringIO()
        writer = csv.DictWriter(output_target, fieldnames=COLS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(article.to_dict() for article in articles)
        return output_target.getvalue()
class LocalArticleGetter(ArticleGetter):
    """Getter which queries for articles from a file and returns Article objects."""

    def _get_query_params(self, target: typing.Dict) -> typing.Dict:
        """Use the caller's dictionary directly as the query parameters.

        Args:
            target: The parameters of the query.
        Returns:
            The same dictionary, unchanged.
        """
        return target

    def _get_source(self) -> typing.Iterable[str]:
        """Read every line of the local article listing.

        Returns:
            Lines of csv/articles.csv (relative to the working directory),
            each still carrying its line terminator.
        """
        source_path = os.path.join('csv', 'articles.csv')
        # Decode explicitly as UTF-8, matching how the S3 copy is read, rather
        # than relying on the platform's default encoding.
        with open(source_path, encoding='utf-8') as f:
            return f.readlines()

    def _make_response(self, matching: typing.Iterable[Article]):
        """Materialize the matching articles.

        Args:
            matching: Lazy iterable of matching articles.
        Returns:
            List of the matching Article objects.
        """
        return list(matching)
def lambda_handler(event, context):
    """Serve an article query when running inside AWS Lambda.

    Args:
        event: The Lambda event, including the query parameters.
        context: Lambda contextual information (not used).
    Returns:
        Lambda compatible HTTP response.
    """
    return AwsLambdaArticleGetter().execute_to_native(event)
def local_handler(params: typing.Dict) -> typing.List[Article]:
    """Serve an article query for the visualization app from the local file.

    Args:
        params: The parameters of the query.
    Returns:
        List of Articles.
    """
    getter = LocalArticleGetter()
    matching_articles = getter.execute_to_native(params)  # type: ignore
    return matching_articles