Udovenko_git_pain #2

base: master
@@ -1,13 +1,102 @@
#!/usr/bin/env python
import re
from nltk.wsd import lesk


# Your classes or methods go here:

class Tool:
    def my_method(self):
        print("my_method called!")


class JsonParsedWriter:

    @staticmethod
    def write(data, output_name, indent=2):
        import json
Review comment: Avoid imports placed near where they are used. Unlike local variables (which should be defined close to the exact place where they are used), all imports should be at the top of your module.
        with open(output_name, 'w') as out_file:
            parsed = json.loads(json.dumps(data))
            json.dump(parsed, out_file, indent=indent)
Review comment: Why not just `json.dump(data, out_file, indent=indent)`?
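Pulling the two comments above together, a minimal sketch of what the reviewer appears to be suggesting (same class and signature, `json` moved to the module top, no serialize/parse round-trip):

    import json

    class JsonParsedWriter:

        @staticmethod
        def write(data, output_name, indent=2):
            # data is already a plain dict/list structure,
            # so it can be dumped to the file directly
            with open(output_name, 'w') as out_file:
                json.dump(data, out_file, indent=indent)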

class TweetCleaner:

    def __init__(self, input_file_name):
        self.name = input_file_name

        # list of patterns for regular expressions
        self.tags_patt = re.compile(r",\[.*\]")
Review comment: Yes, it is a good idea to pre-create all the data your methods need right here, in your `__init__`.
        self.clr_tags_patt = re.compile(r"\w+")
        self.body_patt = re.compile(r"#\w+|@\w+")
        self.clr_body_patt = re.compile(r"\w+")
        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
        self.sign_patt = re.compile(r"\$\w+")
        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+|")
        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")

    def __clr_tags_to_metadata(self, row):
        # task 1 and 2 for the tags array, and task 3 for the text
        self.sign_result = re.findall(self.sign_patt, row)
        self.tags_result = re.findall(self.tags_patt, row)
        tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
        url_result = re.findall(self.url_patt, row)
        self.metadata = tags_clr + url_result

    def __remove_dollar_sign_words(self, row, tag_results):
        # task 1 for the text and processing text
        self.body = str()
        for tag in tag_results:
            self.body = row.replace(tag, '')

        self.clr_row = self.body.replace("&amp;", "&") \
            .replace("&#39;", "'").replace("&quot;", '"').replace('’', '\'')

        for sign in self.sign_result:
            self.clr_row = self.clr_row.replace(sign, '')
Review comment: Again, you use the internal state to store the value (`self.clr_row`). Make it the return value of the method instead.

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text
        self.body_tags = re.findall(self.body_patt, clr_row)
Review comment: Writing into the internal state should be avoided. Make it the return value of the method.
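A minimal sketch of that refactor for this method (hypothetical local name, same regex attributes, both findall steps collapsed):

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text: return the tags instead of storing them on self
        body_tags = re.findall(self.body_patt, clr_row)
        return re.findall(self.clr_body_patt, str(body_tags))

The caller would then bind the result, e.g. `body_tags = self.__place_tags_wrd_to_body_tags(clr_row)`.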
        self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text
        self.orphan_tokens = list()
        tokens_clr = list()
        clr_row = clr_row.split()

        for token in clr_row:
            # the trailing "|" in token_skip_patt makes findall return empty
            # matches for plain words, so an all-empty result means "keep"
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)

        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                self.orphan_tokens.append(token)
Review comment: Writing into the internal state should be avoided. Make it the return value of the method.
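The same treatment sketched for the tokenizer, under the assumption that nothing else reads `self.orphan_tokens`:

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text: build the list locally and return it
        orphan_tokens = []
        tokens_clr = []
        for token in clr_row.split():
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)
        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                orphan_tokens.append(token)
        return orphan_tokens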

    def processing_text(self):
        with open(self.name, encoding='utf-8') as data_file:
            json_dict = dict()

            # main loop for processing source file rows
            for row in data_file:
                row_dict = dict()

                self.__clr_tags_to_metadata(row=row)
Review comment: This piece of code should have been implemented in the following way:

    metadata, tags_result = self.__clr_tags_to_metadata(row)  # don't be afraid of multiple return values, sometimes it's a good thing
    clr_row = self.__remove_dollar_sign_words(row, tags_result)
    body_tags = self.__place_tags_wrd_to_body_tags(clr_row)
    orphan_tokens = self.__tokenize_add_orphan_tokens(clr_row)
    row_dict["body"] = body  # you could just get the `body` from `row` while reading the parsed file
    row_dict["body_tags"] = body_tags
    row_dict["metadata"] = metadata
    row_dict["orphan_tokens"] = orphan_tokens

Put this way, your program is just a pipe that transforms the data; each piece of the pipe is connected to the other pieces by nothing except its params, return value(s), and (seldom) reads of the internal state.
                self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
                self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
                self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)

                # overall data save
                row_dict["body"] = self.body
                row_dict["body_tags"] = self.body_tags
                row_dict["metadata"] = self.metadata
                row_dict["orphan_tokens"] = self.orphan_tokens

                json_dict.setdefault("records", []).append(row_dict)

        return json_dict


if __name__ == "__main__":
    t = Tool()
    t.my_method()

    tc = TweetCleaner(input_file_name='input.txt')
    processed_text = tc.processing_text()
    t = JsonParsedWriter()
    t.write(processed_text, 'result.json')
@@ -0,0 +1,101 @@
#!/usr/bin/env python
Review comment: I guess this file is obsolete and can be deleted.

import re
from nltk.wsd import lesk


class JsonParsedWriter:

    @staticmethod
    def write(data, output_name, indent=2):
        import json
        with open(output_name, 'w') as out_file:
            parsed = json.loads(json.dumps(data))
            json.dump(parsed, out_file, indent=indent)


class TweetCleaner:

    def __init__(self, input_file_name):
        self.name = input_file_name

        # list of patterns for regular expressions
        self.tags_patt = re.compile(r",\[.*\]")
        self.clr_tags_patt = re.compile(r"\w+")
        self.body_patt = re.compile(r"#\w+|@\w+")
        self.clr_body_patt = re.compile(r"\w+")
        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
        self.sign_patt = re.compile(r"\$\w+")
        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+|")
        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")

    def __clr_tags_to_metadata(self, row):
        # task 1 and 2 for the tags array, and task 3 for the text
        self.sign_result = re.findall(self.sign_patt, row)
        self.tags_result = re.findall(self.tags_patt, row)
        tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
        url_result = re.findall(self.url_patt, row)
        self.metadata = tags_clr + url_result

    def __remove_dollar_sign_words(self, row, tag_results):
        # task 1 for the text and processing text
        self.body = str()
        for tag in tag_results:
            self.body = row.replace(tag, '')

        self.clr_row = self.body.replace("&amp;", "&") \
            .replace("&#39;", "'").replace("&quot;", '"').replace('’', '\'')

        for sign in self.sign_result:
            self.clr_row = self.clr_row.replace(sign, '')

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text
        self.body_tags = re.findall(self.body_patt, clr_row)
        self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text
        self.orphan_tokens = list()
        tokens_clr = list()
        clr_row = clr_row.split()

        for token in clr_row:
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)

        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                self.orphan_tokens.append(token)

    def processing_text(self):
        with open(self.name, encoding='utf-8') as data_file:
            json_dict = dict()

            # main loop for processing source file rows
            for row in data_file:
                row_dict = dict()

                self.__clr_tags_to_metadata(row=row)
                self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
                self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
                self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)

                # overall data save
                row_dict["body"] = self.body
                row_dict["body_tags"] = self.body_tags
                row_dict["metadata"] = self.metadata
                row_dict["orphan_tokens"] = self.orphan_tokens

                json_dict.setdefault("records", []).append(row_dict)

        return json_dict


if __name__ == "__main__":
    tc = TweetCleaner(input_file_name='input.txt')
    processed_text = tc.processing_text()
    t = JsonParsedWriter()
    t.write(processed_text, 'result.json')
Review comment: `out_filename` could be a better parameter name than `output_name`. … `indent` param.
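A sketch folding that rename into the earlier simplification of `write` (keeping `indent` as an optional parameter):

    @staticmethod
    def write(data, out_filename, indent=2):
        with open(out_filename, 'w') as out_file:
            json.dump(data, out_file, indent=indent)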