From f2b71a88e3893c4bc0ac6733762c16dcf6f3c2db Mon Sep 17 00:00:00 2001 From: vloooo Date: Wed, 12 Dec 2018 10:12:06 +0200 Subject: [PATCH 1/4] konec --- main.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 99f91a7..e55bf81 100644 --- a/main.py +++ b/main.py @@ -1,13 +1,89 @@ #!/usr/bin/env python +import nltk +import re +import string +import json -# Your classes or methods go here: +def read_file(name): + """ функция для чтения из файла """ + with open(name) as fl: + return fl.readlines() -class Tool: - def my_method(self): - print("my_method called!") + +class LineHandler: + """ класс для выполнения задания по обработке""" + + def __init__(self, line): + self.inf_dict = {} + self.line = line # входная строка + self.inf_dict['body'] = '' # словарь содержащий требуемые поля + self.inf_dict["metadata"] = [] + self.inf_dict['body_tags'] = [] + self.inf_dict['orphan_tokens'] = [] + + def explore(self): + + dog_index = self.line.rfind('@') + 1 + self.inf_dict["metadata"].append(self.line[dog_index:-3]) # нахожу [@...] + + self.line = self.line[:dog_index-4] + self.inf_dict['body'] = self.line # заполняю body + clean_line = self.remove_words(r'[\$]\w+', self.line) # удаляю долларовые слова + + self.inf_dict['body_tags'] = re.findall(r'[\#\@](\w+)', clean_line) # зплняю body_tags очищаю строку + clean_line = self.remove_words(r'[\#\@](\w+)', clean_line) + + clean_line = self.find_urls(clean_line) + + self.find_trash(clean_line) + + return self.inf_dict + + def find_trash(self, clean_line): + """ поиск безсмысленых слов """ + tokens = clean_line.split(' ') + + clean_tokens = [] + for i in range(len(tokens)): # очстка от "" и пунктуации (односимвлн слова нам тоже не нужны) + if len(tokens[i]) < 2: + continue + while True: + if tokens[i][-1] in string.punctuation: + tokens[i] = tokens[i][:-1] + else: + clean_tokens.append(tokens[i]) + break + + clean_tokens = set(clean_tokens) + for i in clean_tokens: # проверка на смсл + if nltk.wsd.lesk(tokens, i) is None: + self.inf_dict['orphan_tokens'].append(i) + + def find_urls(self, clean_line): + """ отлавливание ЮРЛОВ """ + urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', self.line) + clean_line = self.remove_words('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', + clean_line) + + if len(urls): + self.inf_dict["metadata"].extend(urls) + return clean_line + + def remove_words(self, expr, line): + """ функция для очисти регулярочками """ + return re.sub(expr, '', line) if __name__ == "__main__": - t = Tool() - t.my_method() + + data = read_file('input.txt') # чтение из файла + + result = {'records': []} # словарь для сериализации + + for i in data: # обработка строк + hndlr = LineHandler(i) + result['records'].append(hndlr.explore()) + + with open('output.json', 'w') as file: # сереализация + json.dump(result, file) From 0261025bae86727e833124538924093027e7deaf Mon Sep 17 00:00:00 2001 From: vloooo Date: Wed, 12 Dec 2018 10:13:11 +0200 Subject: [PATCH 2/4] konec --- output.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 output.json diff --git a/output.json b/output.json new file mode 100644 index 0000000..3b724fe --- /dev/null +++ b/output.json @@ -0,0 +1 @@ +{"records": [{"orphan_tokens": [], "body": "$ABBV why price is going down, despite good results?", "body_tags": [], "metadata": ["price"]}, {"orphan_tokens": ["that", "him", "the", "don&;t", "Kevin&;s", "for", "and", "of", "he&;s"], "body": "$MNKD New crop of message board folk the last month or so that don&#39;t know Kevin&#39;s BS and MO. Watch out for him he&#39;s a fake", "body_tags": ["39", "39", "39"], "metadata": ["crop"]}, {"orphan_tokens": ["that’s", "I&rsquo", "for"], "body": "$HMNY I’ not selling, that’s for sure!", "body_tags": [], "metadata": ["sell"]}, {"orphan_tokens": ["the", "10.2", "than", "YTD", "of", "20,708,057", "OCC"], "body": "Total OCC cleared volume yesterday was 10.2% higher than the YTD daily average of 20,708,057. More volume info: https://bit.ly/2MYWIxx", "body_tags": [], "metadata": ["volume", "https://bit.ly/2MYWIxx"]}, {"orphan_tokens": ["87.5", "2018-11-16", "for"], "body": "$CMA max pain is 87.5 for expiry 2018-11-16 Source: http://sweep.ly/maxpain.html", "body_tags": [], "metadata": ["source", "http://sweep.ly/maxpain.html"]}, {"orphan_tokens": ["your"], "body": "$AAPL Markets opened green so afternoon selloff likely! Hedge your bets", "body_tags": [], "metadata": ["selloff"]}, {"orphan_tokens": ["of", "with", "carefu"], "body": "$YINN with trumps record of flip flopping, be carefu", "body_tags": [], "metadata": ["'flip"]}, {"orphan_tokens": ["1.82", "11-02"], "body": "Bulls just notched a win. $SENS rallied 1.82% on 11-02. See peers at https://dividendbot.com?s=SENS", "body_tags": [], "metadata": ["bulls", "https://dividendbot.com?s=SENS"]}, {"orphan_tokens": ["average?I", "INPX", "long,of", "my", "with"], "body": "$INPX my 2nd rs with INPX.$50 average?I am long,of course", "body_tags": [], "metadata": ["longof"]}, {"orphan_tokens": ["12.06", "marketQ...but", "+4,41", ".CA"], "body": "$APH.CA its now 12.06 US +4,41% on marketQ...but I cant see it on any website or viewer", "body_tags": [], "metadata": ["website"]}, {"orphan_tokens": ["and"], "body": "$CRC classic pump and dump", "body_tags": [], "metadata": ["pump"]}, {"orphan_tokens": ["they", "to", "let’s"], "body": "$NDRA they may have a place but they are not going to replace MRI let’s be real", "body_tags": [], "metadata": ["MRI"]}, {"orphan_tokens": ["If", "-8right", "you&;re", "you"], "body": "If you bought $ABBV exactly a year ago, you&#39;re down -8right now: http://stockchoker.com/?s=ABBV&d=20171102&a=1000", "body_tags": ["39"], "metadata": ["ABBV", "http://stockchoker.com/?s=ABBV&d=20171102&a=1000"]}, {"orphan_tokens": ["the", "where", "to"], "body": "$CAPR Wish the stock had some good news now. Needs to get back where it was a couple months ago.", "body_tags": [], "metadata": ["stock"]}, {"orphan_tokens": ["the", "and", "FitReporting"], "body": "@esrouter Different context... Citron reporting acquisition vs. FitReporting profitability. Combine the two stories and $20!", "body_tags": ["esrouter"], "metadata": ["citron"]}, {"orphan_tokens": ["Our", "KeyCorp", "with", "of", "for"], "body": "#STAAnalystAlert for $BLL : KeyCorp Reiterates with a rating of Hold. Our own verdict is Strong Buy http://www.stocktargetadvisor.com/toprating", "body_tags": ["STAAnalystAlert"], "metadata": ["keycorp", "http://www.stocktargetadvisor.com/toprating"]}, {"orphan_tokens": ["the", "this", "of", "anything"], "body": "@MainecoonSlayer anything seems possible in this market, it seems the new generation of investors cant fathom the possibility of a loss", "body_tags": ["MainecoonSlayer"], "metadata": ["investors"]}, {"orphan_tokens": ["it’s", "they", "the", "godaddy", "investis", "to", "The"], "body": "@1hevychevy it’s redirecting. The site is still served on godaddy they just made choice to point the domain to a filing server, investis.", "body_tags": ["1hevychevy"], "metadata": ["godaddy"]}]} \ No newline at end of file From e069bd45e65c3b08300fef7c22b8671161a2ebdd Mon Sep 17 00:00:00 2001 From: vloooo Date: Wed, 12 Dec 2018 10:16:51 +0200 Subject: [PATCH 3/4] rename file --- main.py => main_ovs.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename main.py => main_ovs.py (100%) diff --git a/main.py b/main_ovs.py similarity index 100% rename from main.py rename to main_ovs.py From 4e733424050991475571aa211ef2419eabb0a78a Mon Sep 17 00:00:00 2001 From: vloooo Date: Mon, 17 Dec 2018 10:49:04 +0200 Subject: [PATCH 4/4] new PR --- main_ovs.py | 117 +++++++++++++++++++++++++++++----------------------- output.json | 2 +- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/main_ovs.py b/main_ovs.py index e55bf81..a60bf0c 100644 --- a/main_ovs.py +++ b/main_ovs.py @@ -5,47 +5,36 @@ import json -def read_file(name): - """ функция для чтения из файла """ - with open(name) as fl: - return fl.readlines() +class FileReader: + @staticmethod + def read_file(name): + """ func to read from file """ + with open(name) as fl: + return fl.readlines() -class LineHandler: - """ класс для выполнения задания по обработке""" +class JsonWriter: + @staticmethod + def write_json(name, jsn): + with open(name, 'w') as file: # serialization + json.dump(jsn, file) - def __init__(self, line): - self.inf_dict = {} - self.line = line # входная строка - self.inf_dict['body'] = '' # словарь содержащий требуемые поля - self.inf_dict["metadata"] = [] - self.inf_dict['body_tags'] = [] - self.inf_dict['orphan_tokens'] = [] - - def explore(self): - - dog_index = self.line.rfind('@') + 1 - self.inf_dict["metadata"].append(self.line[dog_index:-3]) # нахожу [@...] - - self.line = self.line[:dog_index-4] - self.inf_dict['body'] = self.line # заполняю body - clean_line = self.remove_words(r'[\$]\w+', self.line) # удаляю долларовые слова - self.inf_dict['body_tags'] = re.findall(r'[\#\@](\w+)', clean_line) # зплняю body_tags очищаю строку - clean_line = self.remove_words(r'[\#\@](\w+)', clean_line) - - clean_line = self.find_urls(clean_line) - - self.find_trash(clean_line) +class RemoverUnwantedWords: + @staticmethod + def remove_words(expr, line): + """ func for cleaning by regex """ + return re.sub(expr, '', line) - return self.inf_dict - def find_trash(self, clean_line): - """ поиск безсмысленых слов """ +class CheckerForMeaning: + @staticmethod + def find_trash(clean_line, inf_dict): + """ find contextless word """ tokens = clean_line.split(' ') clean_tokens = [] - for i in range(len(tokens)): # очстка от "" и пунктуации (односимвлн слова нам тоже не нужны) + for i in range(len(tokens)): # removing punctuation and " " if len(tokens[i]) < 2: continue while True: @@ -56,34 +45,60 @@ def find_trash(self, clean_line): break clean_tokens = set(clean_tokens) - for i in clean_tokens: # проверка на смсл + for i in clean_tokens: # check for meaning if nltk.wsd.lesk(tokens, i) is None: - self.inf_dict['orphan_tokens'].append(i) + inf_dict['orphan_tokens'].append(i) + return inf_dict - def find_urls(self, clean_line): - """ отлавливание ЮРЛОВ """ - urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', self.line) - clean_line = self.remove_words('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', - clean_line) + +class UrlFinder: + @staticmethod + def find_urls(clean_line, raw_line, inf_dict): + """ find URL """ + urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', raw_line) + funk_for_clean = RemoverUnwantedWords.remove_words + clean_line = funk_for_clean('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', + clean_line) if len(urls): - self.inf_dict["metadata"].extend(urls) - return clean_line + inf_dict["metadata"].extend(urls) + return clean_line, inf_dict - def remove_words(self, expr, line): - """ функция для очисти регулярочками """ - return re.sub(expr, '', line) + +class TweetLineHandler: + """ class for performance extraction useful inf from tweet""" + + def __init__(self, line): + self.line = line + + def explore(self): + inf_dict = {'body': '', "metadata": [], 'body_tags': [], 'orphan_tokens': []} + + dog_index = self.line.rfind('@') + 1 + inf_dict["metadata"].append(self.line[dog_index:-3]) # find [@...] + + line = self.line[:dog_index - 4] + inf_dict['body'] = line + clean_line = RemoverUnwantedWords.remove_words(r'[\$]\w+', line) # remove $-words + + inf_dict['body_tags'] = re.findall(r'[\#\@](\w+)', clean_line) + clean_line = RemoverUnwantedWords.remove_words(r'[\#\@](\w+)', clean_line) # clean line + + clean_line, inf_dict = UrlFinder.find_urls(clean_line, self.line, inf_dict) + + inf_dict = CheckerForMeaning.find_trash(clean_line, inf_dict) + + return inf_dict if __name__ == "__main__": - data = read_file('input.txt') # чтение из файла + data = FileReader.read_file('input.txt') # reading from file - result = {'records': []} # словарь для сериализации + result = {'records': []} - for i in data: # обработка строк - hndlr = LineHandler(i) - result['records'].append(hndlr.explore()) + for i in data: # tweet handling + hndlr = TweetLineHandler(i) + result['records'].append(hndlr.explore) - with open('output.json', 'w') as file: # сереализация - json.dump(result, file) + JsonWriter.write_json('output.json', result) diff --git a/output.json b/output.json index 3b724fe..25d4c1e 100644 --- a/output.json +++ b/output.json @@ -1 +1 @@ -{"records": [{"orphan_tokens": [], "body": "$ABBV why price is going down, despite good results?", "body_tags": [], "metadata": ["price"]}, {"orphan_tokens": ["that", "him", "the", "don&;t", "Kevin&;s", "for", "and", "of", "he&;s"], "body": "$MNKD New crop of message board folk the last month or so that don&#39;t know Kevin&#39;s BS and MO. Watch out for him he&#39;s a fake", "body_tags": ["39", "39", "39"], "metadata": ["crop"]}, {"orphan_tokens": ["that’s", "I&rsquo", "for"], "body": "$HMNY I’ not selling, that’s for sure!", "body_tags": [], "metadata": ["sell"]}, {"orphan_tokens": ["the", "10.2", "than", "YTD", "of", "20,708,057", "OCC"], "body": "Total OCC cleared volume yesterday was 10.2% higher than the YTD daily average of 20,708,057. More volume info: https://bit.ly/2MYWIxx", "body_tags": [], "metadata": ["volume", "https://bit.ly/2MYWIxx"]}, {"orphan_tokens": ["87.5", "2018-11-16", "for"], "body": "$CMA max pain is 87.5 for expiry 2018-11-16 Source: http://sweep.ly/maxpain.html", "body_tags": [], "metadata": ["source", "http://sweep.ly/maxpain.html"]}, {"orphan_tokens": ["your"], "body": "$AAPL Markets opened green so afternoon selloff likely! Hedge your bets", "body_tags": [], "metadata": ["selloff"]}, {"orphan_tokens": ["of", "with", "carefu"], "body": "$YINN with trumps record of flip flopping, be carefu", "body_tags": [], "metadata": ["'flip"]}, {"orphan_tokens": ["1.82", "11-02"], "body": "Bulls just notched a win. $SENS rallied 1.82% on 11-02. See peers at https://dividendbot.com?s=SENS", "body_tags": [], "metadata": ["bulls", "https://dividendbot.com?s=SENS"]}, {"orphan_tokens": ["average?I", "INPX", "long,of", "my", "with"], "body": "$INPX my 2nd rs with INPX.$50 average?I am long,of course", "body_tags": [], "metadata": ["longof"]}, {"orphan_tokens": ["12.06", "marketQ...but", "+4,41", ".CA"], "body": "$APH.CA its now 12.06 US +4,41% on marketQ...but I cant see it on any website or viewer", "body_tags": [], "metadata": ["website"]}, {"orphan_tokens": ["and"], "body": "$CRC classic pump and dump", "body_tags": [], "metadata": ["pump"]}, {"orphan_tokens": ["they", "to", "let’s"], "body": "$NDRA they may have a place but they are not going to replace MRI let’s be real", "body_tags": [], "metadata": ["MRI"]}, {"orphan_tokens": ["If", "-8right", "you&;re", "you"], "body": "If you bought $ABBV exactly a year ago, you&#39;re down -8right now: http://stockchoker.com/?s=ABBV&d=20171102&a=1000", "body_tags": ["39"], "metadata": ["ABBV", "http://stockchoker.com/?s=ABBV&d=20171102&a=1000"]}, {"orphan_tokens": ["the", "where", "to"], "body": "$CAPR Wish the stock had some good news now. Needs to get back where it was a couple months ago.", "body_tags": [], "metadata": ["stock"]}, {"orphan_tokens": ["the", "and", "FitReporting"], "body": "@esrouter Different context... Citron reporting acquisition vs. FitReporting profitability. Combine the two stories and $20!", "body_tags": ["esrouter"], "metadata": ["citron"]}, {"orphan_tokens": ["Our", "KeyCorp", "with", "of", "for"], "body": "#STAAnalystAlert for $BLL : KeyCorp Reiterates with a rating of Hold. Our own verdict is Strong Buy http://www.stocktargetadvisor.com/toprating", "body_tags": ["STAAnalystAlert"], "metadata": ["keycorp", "http://www.stocktargetadvisor.com/toprating"]}, {"orphan_tokens": ["the", "this", "of", "anything"], "body": "@MainecoonSlayer anything seems possible in this market, it seems the new generation of investors cant fathom the possibility of a loss", "body_tags": ["MainecoonSlayer"], "metadata": ["investors"]}, {"orphan_tokens": ["it’s", "they", "the", "godaddy", "investis", "to", "The"], "body": "@1hevychevy it’s redirecting. The site is still served on godaddy they just made choice to point the domain to a filing server, investis.", "body_tags": ["1hevychevy"], "metadata": ["godaddy"]}]} \ No newline at end of file +{"records": [{"body": "$ABBV why price is going down, despite good results?", "body_tags": [], "orphan_tokens": [], "metadata": ["price"]}, {"body": "$MNKD New crop of message board folk the last month or so that don&#39;t know Kevin&#39;s BS and MO. Watch out for him he&#39;s a fake", "body_tags": ["39", "39", "39"], "orphan_tokens": ["for", "Kevin&;s", "him", "that", "don&;t", "of", "he&;s", "and", "the"], "metadata": ["crop"]}, {"body": "$HMNY I’ not selling, that’s for sure!", "body_tags": [], "orphan_tokens": ["for", "I&rsquo", "that’s"], "metadata": ["sell"]}, {"body": "Total OCC cleared volume yesterday was 10.2% higher than the YTD daily average of 20,708,057. More volume info: https://bit.ly/2MYWIxx", "body_tags": [], "orphan_tokens": ["of", "10.2", "the", "OCC", "20,708,057", "YTD", "than"], "metadata": ["volume", "https://bit.ly/2MYWIxx,['@volume']"]}, {"body": "$CMA max pain is 87.5 for expiry 2018-11-16 Source: http://sweep.ly/maxpain.html", "body_tags": [], "orphan_tokens": ["for", "2018-11-16", "87.5"], "metadata": ["source", "http://sweep.ly/maxpain.html,['@source']"]}, {"body": "$AAPL Markets opened green so afternoon selloff likely! Hedge your bets", "body_tags": [], "orphan_tokens": ["your"], "metadata": ["selloff"]}, {"body": "$YINN with trumps record of flip flopping, be carefu", "body_tags": [], "orphan_tokens": ["of", "carefu", "with"], "metadata": ["'flip"]}, {"body": "Bulls just notched a win. $SENS rallied 1.82% on 11-02. See peers at https://dividendbot.com?s=SENS", "body_tags": [], "orphan_tokens": ["1.82", "11-02"], "metadata": ["bulls", "https://dividendbot.com?s=SENS,['@bulls']"]}, {"body": "$INPX my 2nd rs with INPX.$50 average?I am long,of course", "body_tags": [], "orphan_tokens": ["INPX", "long,of", "my", "with", "average?I"], "metadata": ["longof"]}, {"body": "$APH.CA its now 12.06 US +4,41% on marketQ...but I cant see it on any website or viewer", "body_tags": [], "orphan_tokens": [".CA", "12.06", "+4,41", "marketQ...but"], "metadata": ["website"]}, {"body": "$CRC classic pump and dump", "body_tags": [], "orphan_tokens": ["and"], "metadata": ["pump"]}, {"body": "$NDRA they may have a place but they are not going to replace MRI let’s be real", "body_tags": [], "orphan_tokens": ["let’s", "they", "to"], "metadata": ["MRI"]}, {"body": "If you bought $ABBV exactly a year ago, you&#39;re down -8right now: http://stockchoker.com/?s=ABBV&d=20171102&a=1000", "body_tags": ["39"], "orphan_tokens": ["-8right", "you", "you&;re", "If"], "metadata": ["ABBV", "http://stockchoker.com/?s=ABBV&d=20171102&a=1000,['@ABBV']"]}, {"body": "$CAPR Wish the stock had some good news now. Needs to get back where it was a couple months ago.", "body_tags": [], "orphan_tokens": ["to", "where", "the"], "metadata": ["stock"]}, {"body": "@esrouter Different context... Citron reporting acquisition vs. FitReporting profitability. Combine the two stories and $20!", "body_tags": ["esrouter"], "orphan_tokens": ["the", "FitReporting", "and"], "metadata": ["citron"]}, {"body": "#STAAnalystAlert for $BLL : KeyCorp Reiterates with a rating of Hold. Our own verdict is Strong Buy http://www.stocktargetadvisor.com/toprating", "body_tags": ["STAAnalystAlert"], "orphan_tokens": ["of", "KeyCorp", "for", "Our", "with"], "metadata": ["keycorp", "http://www.stocktargetadvisor.com/toprating,['@keycorp']"]}, {"body": "@MainecoonSlayer anything seems possible in this market, it seems the new generation of investors cant fathom the possibility of a loss", "body_tags": ["MainecoonSlayer"], "orphan_tokens": ["of", "anything", "this", "the"], "metadata": ["investors"]}, {"body": "@1hevychevy it’s redirecting. The site is still served on godaddy they just made choice to point the domain to a filing server, investis.", "body_tags": ["1hevychevy"], "orphan_tokens": ["The", "it’s", "investis", "they", "the", "to", "godaddy"], "metadata": ["godaddy"]}]} \ No newline at end of file