diff --git a/main.py b/main.py
index 99f91a7..14070ad 100644
--- a/main.py
+++ b/main.py
@@ -1,13 +1,102 @@
#!/usr/bin/env python
+import json
+import re
+
+from nltk.wsd import lesk
-# Your classes or methods go here:
-class Tool:
-    def my_method(self):
-        print("my_method called!")
+
+
+class JsonParsedWriter:
+
+    @staticmethod
+    def write(data, output_name, indent=2):
+        # data is already a plain dict/list structure, so it can be dumped
+        # directly; a json.loads(json.dumps(...)) round-trip adds nothing
+        with open(output_name, 'w') as out_file:
+            json.dump(data, out_file, indent=indent)
+
+
+class TweetCleaner:
+
+ def __init__(self, input_file_name):
+
+ self.name = input_file_name
+
+        # precompiled patterns for the cleaning steps
+        self.tags_patt = re.compile(r",\[.*\]")
+        self.clr_tags_patt = re.compile(r"\w+")
+        self.body_patt = re.compile(r"#\w+|@\w+")
+        self.clr_body_patt = re.compile(r"\w+")
+        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
+        # the "$" must be escaped: a bare "$" is the end-of-string anchor
+        # and would never match a cashtag word
+        self.sign_patt = re.compile(r"\$\w+")
+        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+")
+        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")
+
+ def __clr_tags_to_metadata(self, row):
+        # tasks 1-2 for the tags array and task 3 for the text:
+        # collect the cleaned tags and URLs into the row's metadata
+ self.sign_result = re.findall(self.sign_patt, row)
+ self.tags_result = re.findall(self.tags_patt, row)
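+        # note: running \w+ over str() of the match list is a shortcut that
+        # strips the surrounding brackets, quotes and commas in one pass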
+ tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
+ url_result = re.findall(self.url_patt, row)
+ self.metadata = tags_clr + url_result
+
+    def __remove_dollar_sign_words(self, row, tag_results):
+        # task 1 for the text: strip the tags array, undo HTML escaping,
+        # and remove cashtag words from the body
+        self.body = row
+        for tag in tag_results:
+            self.body = self.body.replace(tag, '')
+
+        self.clr_row = self.body.replace("&amp;", "&") \
+            .replace("&#39;", "'").replace("&quot;", '"').replace('’', "'")
+
+        for sign in self.sign_result:
+            self.clr_row = self.clr_row.replace(sign, '')
+
+ def __place_tags_wrd_to_body_tags(self, clr_row):
+        # task 2 for the text: collect "#"/"@" entities, then strip the sigils
+ self.body_tags = re.findall(self.body_patt, clr_row)
+ self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))
+
+    def __tokenize_add_orphan_tokens(self, clr_row):
+        # task 4 for the text: keep plain tokens only, then record the ones
+        # that lesk() cannot map to any WordNet sense
+        self.orphan_tokens = list()
+        tokens_clr = list()
+
+        for token in clr_row.split():
+            # skip hashtags, mentions, cashtags and URLs
+            if not self.token_skip_patt.search(token):
+                tokens_clr += re.findall(self.token_clr_patt, token)
+
+        for token in tokens_clr:
+            # lesk() returns None when it finds no synset for the token
+            if lesk(tokens_clr, token) is None:
+                self.orphan_tokens.append(token)
+
+ def processing_text(self):
+
+ with open(self.name, encoding='utf-8') as data_file:
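+            # every input row becomes one record in the output dict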
+ json_dict = dict()
+
+ # main loop for processing source file rows
+ for row in data_file:
+ row_dict = dict()
+
+ self.__clr_tags_to_metadata(row=row)
+ self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
+ self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
+ self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)
+
+                # save the fields extracted for this row
+ row_dict["body"] = self.body
+ row_dict["body_tags"] = self.body_tags
+ row_dict["metadata"] = self.metadata
+ row_dict["orphan_tokens"] = self.orphan_tokens
+
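+                # setdefault creates the "records" list on the first row,
+                # then appends to that same list for every later row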
+ json_dict.setdefault("records", []).append(row_dict)
+
+        return json_dict
+
+
 if __name__ == "__main__":
-    t = Tool()
-    t.my_method()
+    tc = TweetCleaner(input_file_name='input.txt')
+    processed_text = tc.processing_text()
+    JsonParsedWriter.write(processed_text, 'result.json')
diff --git a/sourse_udovenko.py b/sourse_udovenko.py
new file mode 100644
index 0000000..77d0c41
--- /dev/null
+++ b/sourse_udovenko.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+import json
+import re
+
+from nltk.wsd import lesk
+
+
+class JsonParsedWriter:
+
+    @staticmethod
+    def write(data, output_name, indent=2):
+        # data is already a plain dict/list structure, so it can be dumped
+        # directly; a json.loads(json.dumps(...)) round-trip adds nothing
+        with open(output_name, 'w') as out_file:
+            json.dump(data, out_file, indent=indent)
+
+
+class TweetCleaner:
+
+ def __init__(self, input_file_name):
+
+ self.name = input_file_name
+
+        # precompiled patterns for the cleaning steps
+        self.tags_patt = re.compile(r",\[.*\]")
+        self.clr_tags_patt = re.compile(r"\w+")
+        self.body_patt = re.compile(r"#\w+|@\w+")
+        self.clr_body_patt = re.compile(r"\w+")
+        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
+        # the "$" must be escaped: a bare "$" is the end-of-string anchor
+        # and would never match a cashtag word
+        self.sign_patt = re.compile(r"\$\w+")
+        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+")
+        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")
+
+ def __clr_tags_to_metadata(self, row):
+        # tasks 1-2 for the tags array and task 3 for the text:
+        # collect the cleaned tags and URLs into the row's metadata
+ self.sign_result = re.findall(self.sign_patt, row)
+ self.tags_result = re.findall(self.tags_patt, row)
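+        # note: running \w+ over str() of the match list is a shortcut that
+        # strips the surrounding brackets, quotes and commas in one pass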
+ tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
+ url_result = re.findall(self.url_patt, row)
+ self.metadata = tags_clr + url_result
+
+    def __remove_dollar_sign_words(self, row, tag_results):
+        # task 1 for the text: strip the tags array, undo HTML escaping,
+        # and remove cashtag words from the body
+        self.body = row
+        for tag in tag_results:
+            self.body = self.body.replace(tag, '')
+
+        self.clr_row = self.body.replace("&amp;", "&") \
+            .replace("&#39;", "'").replace("&quot;", '"').replace('’', "'")
+
+        for sign in self.sign_result:
+            self.clr_row = self.clr_row.replace(sign, '')
+
+ def __place_tags_wrd_to_body_tags(self, clr_row):
+        # task 2 for the text: collect "#"/"@" entities, then strip the sigils
+ self.body_tags = re.findall(self.body_patt, clr_row)
+ self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))
+
+    def __tokenize_add_orphan_tokens(self, clr_row):
+        # task 4 for the text: keep plain tokens only, then record the ones
+        # that lesk() cannot map to any WordNet sense
+        self.orphan_tokens = list()
+        tokens_clr = list()
+
+        for token in clr_row.split():
+            # skip hashtags, mentions, cashtags and URLs
+            if not self.token_skip_patt.search(token):
+                tokens_clr += re.findall(self.token_clr_patt, token)
+
+        for token in tokens_clr:
+            # lesk() returns None when it finds no synset for the token
+            if lesk(tokens_clr, token) is None:
+                self.orphan_tokens.append(token)
+
+ def processing_text(self):
+
+ with open(self.name, encoding='utf-8') as data_file:
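+            # every input row becomes one record in the output dict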
+ json_dict = dict()
+
+ # main loop for processing source file rows
+ for row in data_file:
+ row_dict = dict()
+
+ self.__clr_tags_to_metadata(row=row)
+ self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
+ self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
+ self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)
+
+                # save the fields extracted for this row
+ row_dict["body"] = self.body
+ row_dict["body_tags"] = self.body_tags
+ row_dict["metadata"] = self.metadata
+ row_dict["orphan_tokens"] = self.orphan_tokens
+
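+                # setdefault creates the "records" list on the first row,
+                # then appends to that same list for every later row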
+ json_dict.setdefault("records", []).append(row_dict)
+
+ return json_dict
+
+
+if __name__ == "__main__":
+ tc = TweetCleaner(input_file_name='input.txt')
+ processed_text = tc.processing_text()
+    JsonParsedWriter.write(processed_text, 'result.json')