-
Notifications
You must be signed in to change notification settings - Fork 1
Ya vse. #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Ya vse. #1
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#!/usr/bin/env python | ||
import nltk | ||
import re | ||
import string | ||
import json | ||
|
||
|
||
class FileReader:
    """Read raw input lines from a text file."""

    @staticmethod
    def read_file(name, encoding="utf-8"):
        """Return every line of the file at ``name`` as a list of strings.

        Each entry keeps its line terminator, exactly as ``readlines``
        produces it.

        Args:
            name: Path of the file to read.
            encoding: Text encoding used to decode the file. Defaults to
                UTF-8 so the result does not depend on the platform locale.

        Returns:
            A list with one string per line; empty for an empty file.
        """
        with open(name, encoding=encoding) as source:
            return source.readlines()
|
||
|
||
class JsonWriter:
    """Persist a JSON-serializable object to a file."""

    @staticmethod
    def write_json(name, jsn):
        """Serialize ``jsn`` as JSON into the file at path ``name``.

        The file is opened in write mode, so existing content is replaced.
        """
        with open(name, 'w') as sink:
            json.dump(obj=jsn, fp=sink)
|
||
|
||
class RemoverUnwantedWords:
    """Regex-based text scrubber."""

    @staticmethod
    def remove_words(expr, line):
        """Return ``line`` with every match of the regex ``expr`` deleted."""
        matcher = re.compile(expr)
        return matcher.sub('', line)
|
||
|
||
class CheckerForMeaning:
    """Find words for which ``nltk.wsd.lesk`` yields no sense in the line."""

    @staticmethod
    def find_trash(clean_line, inf_dict):
        """Record context-less words of ``clean_line`` as orphan tokens.

        The line is split on single spaces. Tokens shorter than two
        characters are ignored; the rest have trailing punctuation stripped.
        Every distinct stripped token for which ``nltk.wsd.lesk`` returns
        ``None`` is appended to ``inf_dict['orphan_tokens']``.

        Review note: this method both mutates ``inf_dict`` and returns it.
        Review feedback on this code recommends not mutating arguments at
        all, i.e. returning the orphan tokens and letting the caller store
        them (``response["orphan_tokens"] = orphan_tokens``). The in/out
        behaviour is kept here only so existing callers keep working.

        Args:
            clean_line: Tweet text with tags, tickers and URLs removed.
            inf_dict: Record dict; must hold a list under 'orphan_tokens'.

        Returns:
            The same ``inf_dict`` object that was passed in.
        """
        context = []      # tokens handed to lesk as the sentence context
        candidates = []   # distinct stripped tokens, in first-seen order
        seen = set()
        for token in clean_line.split(' '):
            if len(token) >= 2:
                # rstrip cannot fail on a token made only of punctuation
                # (e.g. "..."), unlike indexing [-1] on the emptied string.
                token = token.rstrip(string.punctuation)
                if token and token not in seen:
                    seen.add(token)
                    candidates.append(token)
            context.append(token)

        # Iterating a list instead of a set keeps the output order stable
        # from run to run.
        for word in candidates:
            if nltk.wsd.lesk(context, word) is None:
                inf_dict['orphan_tokens'].append(word)
        return inf_dict
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you absolutely have to mutate the data sent to you as an argument, don't return that same data from your method - your method should have either an "out" parameter to mutate or a return value, not both. Having both confuses the reader of your method. |
||
|
||
|
||
class UrlFinder:
    """Locate URLs in a tweet line and strip them from the cleaned text."""

    # Single compiled pattern shared by the search and the clean-up step.
    # It is a raw string so that "\(" and "\)" reach the regex engine
    # without relying on invalid string escape sequences.
    # NOTE: "[$-_@.&+]" contains the range "$" through "_", which also
    # matches characters such as "/", "?", "=", ",", "'", "[" and "]".
    # A URL at the end of a raw line can therefore swallow the text that
    # follows it; the regex text is intentionally left unchanged.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    @staticmethod
    def find_urls(clean_line, raw_line, inf_dict):
        """Collect URLs from ``raw_line`` and remove URLs from ``clean_line``.

        Args:
            clean_line: Partially cleaned tweet text; URLs are deleted from it.
            raw_line: Unmodified input line; URLs are searched for here.
            inf_dict: Record dict; must hold a list under "metadata". Found
                URLs are appended to that list in place.

        Returns:
            A ``(clean_line, inf_dict)`` tuple: the text without URLs and the
            same ``inf_dict`` object that was passed in.
        """
        urls = UrlFinder._URL_PATTERN.findall(raw_line)
        clean_line = UrlFinder._URL_PATTERN.sub('', clean_line)

        if urls:
            inf_dict["metadata"].extend(urls)
        return clean_line, inf_dict
|
||
|
||
class TweetLineHandler:
    """Extract the useful fields from one raw tweet line."""

    def __init__(self, line):
        # Raw input line, kept verbatim; explore() slices it by position.
        self.line = line

    def explore(self):
        """Build the record dict for this line.

        The record has the keys 'body', "metadata", 'body_tags' and
        'orphan_tokens'. The text after the last '@' (minus the final three
        characters) becomes the first metadata entry, and the text before it
        (minus four characters) becomes the body.
        NOTE(review): these offsets presumably match a line that ends with
        ``,['@tag']`` plus a newline - confirm against the input format.

        NOTE(review): the two helper calls at the end receive the record and
        hand it back; review feedback on this code asks that helpers stop
        mutating the data passed to them and return their parts instead.

        Returns:
            The populated record dict.
        """
        raw = self.line
        after_at = raw.rfind('@') + 1

        record = {
            'body': '',
            "metadata": [raw[after_at:-3]],  # the trailing [@...] marker
            'body_tags': [],
            'orphan_tokens': [],
        }

        body = raw[:after_at - 4]
        record['body'] = body

        scrub = RemoverUnwantedWords.remove_words
        text = scrub(r'[\$]\w+', body)  # drop $-prefixed words

        # Remember the #/@ tag names, then delete the tags from the text.
        record['body_tags'] = re.findall(r'[\#\@](\w+)', text)
        text = scrub(r'[\#\@](\w+)', text)

        text, record = UrlFinder.find_urls(text, raw, record)
        record = CheckerForMeaning.find_trash(text, record)
        return record
|
||
|
||
if __name__ == "__main__":
    # Script entry point: read tweets from input.txt, build one record per
    # line and write all records to output.json.
    data = FileReader.read_file('input.txt')

    result = {'records': []}

    for raw_line in data:
        handler = TweetLineHandler(raw_line)
        # explore must be *called*: appending the bound method itself would
        # put a non-serializable object into the result and break json.dump.
        result['records'].append(handler.explore())

    JsonWriter.write_json('output.json', result)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"records": [{"body": "$ABBV why price is going down, despite good results?", "body_tags": [], "orphan_tokens": [], "metadata": ["price"]}, {"body": "$MNKD New crop of message board folk the last month or so that don&#39;t know Kevin&#39;s BS and MO. Watch out for him he&#39;s a fake", "body_tags": ["39", "39", "39"], "orphan_tokens": ["for", "Kevin&;s", "him", "that", "don&;t", "of", "he&;s", "and", "the"], "metadata": ["crop"]}, {"body": "$HMNY I’ not selling, that’s for sure!", "body_tags": [], "orphan_tokens": ["for", "I&rsquo", "that’s"], "metadata": ["sell"]}, {"body": "Total OCC cleared volume yesterday was 10.2% higher than the YTD daily average of 20,708,057. More volume info: https://bit.ly/2MYWIxx", "body_tags": [], "orphan_tokens": ["of", "10.2", "the", "OCC", "20,708,057", "YTD", "than"], "metadata": ["volume", "https://bit.ly/2MYWIxx,['@volume']"]}, {"body": "$CMA max pain is 87.5 for expiry 2018-11-16 Source: http://sweep.ly/maxpain.html", "body_tags": [], "orphan_tokens": ["for", "2018-11-16", "87.5"], "metadata": ["source", "http://sweep.ly/maxpain.html,['@source']"]}, {"body": "$AAPL Markets opened green so afternoon selloff likely! Hedge your bets", "body_tags": [], "orphan_tokens": ["your"], "metadata": ["selloff"]}, {"body": "$YINN with trumps record of flip flopping, be carefu", "body_tags": [], "orphan_tokens": ["of", "carefu", "with"], "metadata": ["'flip"]}, {"body": "Bulls just notched a win. $SENS rallied 1.82% on 11-02. 
See peers at https://dividendbot.com?s=SENS", "body_tags": [], "orphan_tokens": ["1.82", "11-02"], "metadata": ["bulls", "https://dividendbot.com?s=SENS,['@bulls']"]}, {"body": "$INPX my 2nd rs with INPX.$50 average?I am long,of course", "body_tags": [], "orphan_tokens": ["INPX", "long,of", "my", "with", "average?I"], "metadata": ["longof"]}, {"body": "$APH.CA its now 12.06 US +4,41% on marketQ...but I cant see it on any website or viewer", "body_tags": [], "orphan_tokens": [".CA", "12.06", "+4,41", "marketQ...but"], "metadata": ["website"]}, {"body": "$CRC classic pump and dump", "body_tags": [], "orphan_tokens": ["and"], "metadata": ["pump"]}, {"body": "$NDRA they may have a place but they are not going to replace MRI let’s be real", "body_tags": [], "orphan_tokens": ["let’s", "they", "to"], "metadata": ["MRI"]}, {"body": "If you bought $ABBV exactly a year ago, you&#39;re down -8right now: http://stockchoker.com/?s=ABBV&d=20171102&a=1000", "body_tags": ["39"], "orphan_tokens": ["-8right", "you", "you&;re", "If"], "metadata": ["ABBV", "http://stockchoker.com/?s=ABBV&d=20171102&a=1000,['@ABBV']"]}, {"body": "$CAPR Wish the stock had some good news now. Needs to get back where it was a couple months ago.", "body_tags": [], "orphan_tokens": ["to", "where", "the"], "metadata": ["stock"]}, {"body": "@esrouter Different context... Citron reporting acquisition vs. FitReporting profitability. Combine the two stories and $20!", "body_tags": ["esrouter"], "orphan_tokens": ["the", "FitReporting", "and"], "metadata": ["citron"]}, {"body": "#STAAnalystAlert for $BLL : KeyCorp Reiterates with a rating of Hold. 
Our own verdict is Strong Buy http://www.stocktargetadvisor.com/toprating", "body_tags": ["STAAnalystAlert"], "orphan_tokens": ["of", "KeyCorp", "for", "Our", "with"], "metadata": ["keycorp", "http://www.stocktargetadvisor.com/toprating,['@keycorp']"]}, {"body": "@MainecoonSlayer anything seems possible in this market, it seems the new generation of investors cant fathom the possibility of a loss", "body_tags": ["MainecoonSlayer"], "orphan_tokens": ["of", "anything", "this", "the"], "metadata": ["investors"]}, {"body": "@1hevychevy it’s redirecting. The site is still served on godaddy they just made choice to point the domain to a filing server, investis.", "body_tags": ["1hevychevy"], "orphan_tokens": ["The", "it’s", "investis", "they", "the", "to", "godaddy"], "metadata": ["godaddy"]}]} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This class is very good in terms of SRP, but this comment is redundant - the code is pretty self-documenting.