Udovenko_git_pain #2

base: master
@@ -1,13 +1,102 @@
#!/usr/bin/env python
import re
from nltk.wsd import lesk


# Your classes or methods go here:

class Tool:
    def my_method(self):
        print("my_method called!")


class JsonParsedWriter:

    @staticmethod
    def write(data, output_name, indent=2):
        import json
Review comment: Avoid imports placed near where they are used. Unlike local variables (which should be defined close to the exact place where they are used), all imports should be at the top of your module.
        with open(output_name, 'w') as out_file:
            parsed = json.loads(json.dumps(data))
            json.dump(parsed, out_file, indent=indent)
Review comment: Why not just `json.dump(data, out_file, indent=indent)`?
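Pulling the two comments above together, a minimal sketch of what the reviewer appears to be suggesting (same class and signature, `json` moved to the module top, no serialize/parse round-trip):

    import json

    class JsonParsedWriter:

        @staticmethod
        def write(data, output_name, indent=2):
            # data is already a plain dict/list structure,
            # so it can be dumped to the file directly
            with open(output_name, 'w') as out_file:
                json.dump(data, out_file, indent=indent)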

class TweetCleaner:

    def __init__(self, input_file_name):
        self.name = input_file_name

        # list of patterns for regular expressions
        self.tags_patt = re.compile(r",\[.*\]")
Review comment: Yes, it is a good idea to pre-create all the data your methods need right here, in your `__init__`.
        self.clr_tags_patt = re.compile(r"\w+")
        self.body_patt = re.compile(r"#\w+|@\w+")
        self.clr_body_patt = re.compile(r"\w+")
        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
        self.sign_patt = re.compile(r"\$\w+")
        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+|")
        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")

    def __clr_tags_to_metadata(self, row):
        # task 1 and 2 for the tags array, and task 3 for the text
        self.sign_result = re.findall(self.sign_patt, row)
        self.tags_result = re.findall(self.tags_patt, row)
        tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
        url_result = re.findall(self.url_patt, row)
        self.metadata = tags_clr + url_result

    def __remove_dollar_sign_words(self, row, tag_results):
        # task 1 for the text and processing text
        self.body = str()
        for tag in tag_results:
            self.body = row.replace(tag, '')

        self.clr_row = self.body.replace("&amp;", "&") \
            .replace("&#39;", "'").replace("&quot;", '"').replace('’', '\'')

        for sign in self.sign_result:
            self.clr_row = self.clr_row.replace(sign, '')
Review comment: Again, you use the internal state to store the value (`self.clr_row`). Make it the return value of the method instead.

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text
        self.body_tags = re.findall(self.body_patt, clr_row)
Review comment: Writing into the internal state should be avoided. Make it the return value of the method.
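A minimal sketch of that refactor for this method (hypothetical local name, same regex attributes, both findall steps collapsed):

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text: return the tags instead of storing them on self
        body_tags = re.findall(self.body_patt, clr_row)
        return re.findall(self.clr_body_patt, str(body_tags))

The caller would then bind the result, e.g. `body_tags = self.__place_tags_wrd_to_body_tags(clr_row)`.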
        self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text
        self.orphan_tokens = list()
        tokens_clr = list()
        clr_row = clr_row.split()

        for token in clr_row:
            # the trailing "|" in token_skip_patt makes findall return empty
            # matches for plain words, so an all-empty result means "keep"
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)

        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                self.orphan_tokens.append(token)
Review comment: Writing into the internal state should be avoided. Make it the return value of the method.
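The same treatment sketched for the tokenizer, under the assumption that nothing else reads `self.orphan_tokens`:

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text: build the list locally and return it
        orphan_tokens = []
        tokens_clr = []
        for token in clr_row.split():
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)
        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                orphan_tokens.append(token)
        return orphan_tokens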

    def processing_text(self):
        with open(self.name, encoding='utf-8') as data_file:
            json_dict = dict()

            # main loop for processing source file rows
            for row in data_file:
                row_dict = dict()

                self.__clr_tags_to_metadata(row=row)
Review comment: This piece of code should have been implemented in the following way:

    metadata, tags_result = self.__clr_tags_to_metadata(row)  # don't be afraid of multiple return values, sometimes it's a good thing
    clr_row = self.__remove_dollar_sign_words(row, tags_result)
    body_tags = self.__place_tags_wrd_to_body_tags(clr_row)
    orphan_tokens = self.__tokenize_add_orphan_tokens(clr_row)
    row_dict["body"] = body  # you could just get the `body` from `row` while reading the parsed file
    row_dict["body_tags"] = body_tags
    row_dict["metadata"] = metadata
    row_dict["orphan_tokens"] = orphan_tokens

Put this way, your program is just a pipe that transforms the data; each piece of the pipe is connected to the other pieces by nothing except its params, return value(s), and (seldom) reads of the internal state.
                self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
                self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
                self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)

                # overall data save
                row_dict["body"] = self.body
                row_dict["body_tags"] = self.body_tags
                row_dict["metadata"] = self.metadata
                row_dict["orphan_tokens"] = self.orphan_tokens

                json_dict.setdefault("records", []).append(row_dict)

        return json_dict


if __name__ == "__main__":
    t = Tool()
    t.my_method()

    tc = TweetCleaner(input_file_name='input.txt')
    processed_text = tc.processing_text()
    t = JsonParsedWriter()
    t.write(processed_text, 'result.json')
@@ -0,0 +1,101 @@
#!/usr/bin/env python
Review comment: I guess this file is obsolete and can be deleted.

import re
from nltk.wsd import lesk


class JsonParsedWriter:

    @staticmethod
    def write(data, output_name, indent=2):
        import json
        with open(output_name, 'w') as out_file:
            parsed = json.loads(json.dumps(data))
            json.dump(parsed, out_file, indent=indent)


class TweetCleaner:

    def __init__(self, input_file_name):
        self.name = input_file_name

        # list of patterns for regular expressions
        self.tags_patt = re.compile(r",\[.*\]")
        self.clr_tags_patt = re.compile(r"\w+")
        self.body_patt = re.compile(r"#\w+|@\w+")
        self.clr_body_patt = re.compile(r"\w+")
        self.url_patt = re.compile(r"https?://[A-Za-z0-9./]+")
        self.sign_patt = re.compile(r"\$\w+")
        self.token_skip_patt = re.compile(r"#\w+|@\w+|\$\w+|https?://[A-Za-z0-9./]+|")
        self.token_clr_patt = re.compile(r"[A-Za-z0-9.\-:]+")

    def __clr_tags_to_metadata(self, row):
        # task 1 and 2 for the tags array, and task 3 for the text
        self.sign_result = re.findall(self.sign_patt, row)
        self.tags_result = re.findall(self.tags_patt, row)
        tags_clr = re.findall(self.clr_tags_patt, str(self.tags_result))
        url_result = re.findall(self.url_patt, row)
        self.metadata = tags_clr + url_result

    def __remove_dollar_sign_words(self, row, tag_results):
        # task 1 for the text and processing text
        self.body = str()
        for tag in tag_results:
            self.body = row.replace(tag, '')

        self.clr_row = self.body.replace("&amp;", "&") \
            .replace("&#39;", "'").replace("&quot;", '"').replace('’', '\'')

        for sign in self.sign_result:
            self.clr_row = self.clr_row.replace(sign, '')

    def __place_tags_wrd_to_body_tags(self, clr_row):
        # task 2 for the text
        self.body_tags = re.findall(self.body_patt, clr_row)
        self.body_tags = re.findall(self.clr_body_patt, str(self.body_tags))

    def __tokenize_add_orphan_tokens(self, clr_row):
        # task 4 for text
        self.orphan_tokens = list()
        tokens_clr = list()
        clr_row = clr_row.split()

        for token in clr_row:
            token_skip = re.findall(self.token_skip_patt, token)
            if len(max(token_skip, key=len)) == 0:
                tokens_clr += re.findall(self.token_clr_patt, token)

        for token in tokens_clr:
            if lesk(tokens_clr, token) is None:
                self.orphan_tokens.append(token)

    def processing_text(self):
        with open(self.name, encoding='utf-8') as data_file:
            json_dict = dict()

            # main loop for processing source file rows
            for row in data_file:
                row_dict = dict()

                self.__clr_tags_to_metadata(row=row)
                self.__remove_dollar_sign_words(row=row, tag_results=self.tags_result)
                self.__place_tags_wrd_to_body_tags(clr_row=self.clr_row)
                self.__tokenize_add_orphan_tokens(clr_row=self.clr_row)

                # overall data save
                row_dict["body"] = self.body
                row_dict["body_tags"] = self.body_tags
                row_dict["metadata"] = self.metadata
                row_dict["orphan_tokens"] = self.orphan_tokens

                json_dict.setdefault("records", []).append(row_dict)

        return json_dict


if __name__ == "__main__":
    tc = TweetCleaner(input_file_name='input.txt')
    processed_text = tc.processing_text()
    t = JsonParsedWriter()
    t.write(processed_text, 'result.json')
Review comment: `out_filename` could be a better parameter name than `output_name`. … `indent` param.
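A sketch folding that rename into the earlier simplification of `write` (keeping `indent` as an optional parameter):

    @staticmethod
    def write(data, out_filename, indent=2):
        with open(out_filename, 'w') as out_file:
            json.dump(data, out_file, indent=indent)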