#!/usr/bin/env python
"""Convert a text file of rows into a JSON file with one record per row.

Each record holds the row body (the row without its trailing tags array),
the hashtags/mentions found in the text, the metadata (words of the tags
array plus URLs) and the "orphan" tokens for which ``lesk`` returns None.
"""
import html
import json
import re

from nltk.wsd import lesk

# ",[ ... ]" tags array of a row; greedy, so it runs up to the last "]".
_TAGS_ARRAY_PATT = re.compile(r",\[.*\]")
# A run of word characters; used to strip "#", "@", brackets and commas.
_WORD_PATT = re.compile(r"\w+")
# Hashtags and mentions inside the text.
_BODY_TAG_PATT = re.compile(r"#\w+|@\w+")
_URL_PATT = re.compile(r"https?://[A-Za-z0-9./]+")
# "$" has to be escaped: a bare "$" is the end-of-string anchor, so the
# previous r"$\w+" could never match anything.
_SIGN_PATT = re.compile(r"\$\w+")
# A whitespace-separated chunk containing any of these is not tokenized.
_TOKEN_SKIP_PATT = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+")
_TOKEN_CLR_PATT = re.compile(r"[A-Za-z0-9.\-:]+")


def _words(fragments):
    """Return every run of word characters of each string, in order.

    The strings are scanned directly rather than through ``str(list)``,
    whose repr escapes (e.g. a tab shown as backslash + "t") would leak
    spurious words into the result.
    """
    found = []
    for fragment in fragments:
        found.extend(_WORD_PATT.findall(fragment))
    return found


class TextProcessor:
    """Turn rows of raw text into JSON records."""

    def processing_text_to_json(self, input_file, output_file):
        """Read *input_file* row by row and write the records to *output_file*.

        Args:
            input_file: path of the UTF-8 text file; every line is one row.
            output_file: path of the JSON file to create or overwrite.

        The output is ``{"records": [...]}`` with one record per input line,
        in input order. The "records" key is written even when the input
        file is empty.

        Raises:
            OSError: if either file cannot be opened.
        """
        with open(input_file, encoding='utf-8') as data_file:
            records = [self._process_row(row) for row in data_file]

        # json.dump escapes non-ASCII characters by default, so the explicit
        # encoding only makes the result independent of the locale.
        with open(output_file, 'w', encoding='utf-8') as out_file:
            json.dump({"records": records}, out_file, indent=2)

    @staticmethod
    def _process_row(row):
        """Build the record dictionary for a single input row."""
        tags_arrays = _TAGS_ARRAY_PATT.findall(row)
        metadata = _words(tags_arrays) + _URL_PATT.findall(row)

        # Drop the tags array from the row. Using sub() keeps the text of
        # rows that have no tags array instead of leaving the body empty.
        # The line terminator of the row is not stripped.
        body = _TAGS_ARRAY_PATT.sub('', row)

        # Decode HTML character references (e.g. "&amp;", "&quot;") and
        # normalize the typographic apostrophe, then drop "$..." signs.
        text = html.unescape(body).replace('’', '\'')
        text = _SIGN_PATT.sub('', text)

        body_tags = _words(_BODY_TAG_PATT.findall(text))

        # Tokenize every whitespace-separated chunk that holds no hashtag,
        # mention, "$" sign or URL.
        tokens = []
        for chunk in text.split():
            if _TOKEN_SKIP_PATT.search(chunk) is None:
                tokens.extend(_TOKEN_CLR_PATT.findall(chunk))

        # A token is an orphan when lesk() returns None for it, with the
        # full token list of the row passed as the context sentence.
        orphan_tokens = [
            token for token in tokens if lesk(tokens, token) is None
        ]

        return {
            "body": body,
            "body_tags": body_tags,
            "metadata": metadata,
            "orphan_tokens": orphan_tokens,
        }


if __name__ == "__main__":
    t = TextProcessor()
    t.processing_text_to_json(input_file='input.txt', output_file='output.json')
b/.idea/misc.xml new file mode 100644 index 0000000..65531ca --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..27d5593 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..59fd762 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +