#!/usr/bin/env python
"""Turn a file of tagged tweets into structured JSON records.

Every input line is expected to end with a ``,['@<tag>']`` suffix.  For each
line the script records the tweet body, the trailing tag plus any URLs
("metadata"), the ``#``/``@`` tags found in the body, and the words for which
no word sense could be found ("orphan tokens").

The result is written as ``{"records": [<record>, ...]}`` where each record
has the keys ``body``, ``metadata``, ``body_tags`` and ``orphan_tokens``.
"""
import html
import json
import re
import string


class FileReader:
    @staticmethod
    def read_file(name):
        """Return the lines of the text file *name* (line endings kept)."""
        with open(name, encoding="utf-8") as source:
            return source.readlines()


class JsonWriter:
    @staticmethod
    def write_json(name, jsn):
        """Serialize *jsn* as JSON into the file *name*, overwriting it."""
        with open(name, 'w') as target:
            json.dump(jsn, target)


class RemoverUnwantedWords:
    @staticmethod
    def remove_words(expr, line):
        """Return *line* with every match of the regex *expr* removed."""
        return re.sub(expr, '', line)


class CheckerForMeaning:
    @staticmethod
    def find_trash(clean_line, inf_dict, sense_lookup=None):
        """Collect the words of *clean_line* that have no known sense.

        Tokens shorter than two characters are ignored, trailing punctuation
        is stripped, and every distinct remaining word is passed to
        *sense_lookup* as ``sense_lookup(all_words, word)``.  Words for which
        it returns ``None`` are appended, in order of first appearance, to
        ``inf_dict['orphan_tokens']``.

        *sense_lookup* defaults to ``nltk.wsd.lesk``; NLTK is imported only
        when that default is actually needed.

        Returns *inf_dict* (which is also modified in place).
        """
        if sense_lookup is None:
            # Imported lazily so the regex-only helpers of this module stay
            # usable on machines without NLTK.
            from nltk.wsd import lesk
            sense_lookup = lesk

        words = []
        for token in clean_line.split():
            if len(token) < 2:
                continue
            # rstrip() may leave an empty string for tokens such as "..." or
            # "--"; those carry no word and must not be looked up.
            word = token.rstrip(string.punctuation)
            if word:
                words.append(word)

        # dict.fromkeys() removes duplicates while keeping a stable order,
        # so the output does not change from run to run.
        for word in dict.fromkeys(words):
            if sense_lookup(words, word) is None:
                inf_dict['orphan_tokens'].append(word)
        return inf_dict


class UrlFinder:
    # NOTE: inside the third character class "$-_" is a *range* (0x24-0x5F).
    # That range is what lets "/", "?", "=", ":" and "," be part of a URL, so
    # the pattern never stops at those characters; callers must not hand it
    # text in which a URL is directly followed by unrelated punctuation.
    URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    @staticmethod
    def find_urls(clean_line, raw_line, inf_dict):
        """Move URLs into the metadata and strip them from *clean_line*.

        URLs are searched in *raw_line* and appended to
        ``inf_dict['metadata']``; every URL match is removed from
        *clean_line*.

        Returns the tuple ``(clean_line_without_urls, inf_dict)``.
        """
        inf_dict["metadata"].extend(UrlFinder.URL_PATTERN.findall(raw_line))
        return UrlFinder.URL_PATTERN.sub('', clean_line), inf_dict


class TweetLineHandler:
    """Extract the useful information from one input line."""

    # Trailing ",['@<tag>']" suffix, with or without a final line break.
    TAG_SUFFIX = re.compile(r",\['@(?P<tag>[^@]*)'\]\s*$")
    CASHTAG = re.compile(r'[\$]\w+')
    BODY_TAG = re.compile(r'[\#\@](\w+)')

    def __init__(self, line, sense_lookup=None):
        """Store the raw *line*.

        *sense_lookup* is forwarded to ``CheckerForMeaning.find_trash``;
        leave it as ``None`` to use the NLTK default.
        """
        self.line = line
        self.sense_lookup = sense_lookup

    def _split_record(self):
        """Return ``(body, tag)``; *tag* is ``None`` without a tag suffix."""
        match = self.TAG_SUFFIX.search(self.line)
        if match is None:
            return self.line.rstrip('\r\n'), None
        return self.line[:match.start()], match.group('tag')

    def explore(self):
        """Build and return the record dictionary for this line."""
        inf_dict = {'body': '', "metadata": [], 'body_tags': [], 'orphan_tokens': []}

        body, tag = self._split_record()
        if tag is not None:
            inf_dict["metadata"].append(tag)
        inf_dict['body'] = body

        # Decode HTML entities first: otherwise "&#39;" is reported as the
        # hashtag "39" and words such as "don&#39;t" are torn apart.
        text = html.unescape(body)

        # URLs are looked up in the body only (not in the raw line), so the
        # tag suffix can never be glued onto a URL, and they are removed
        # before the tag handling so "#"/"@"/"$" inside a URL are ignored.
        text, inf_dict = UrlFinder.find_urls(text, text, inf_dict)

        text = self.CASHTAG.sub('', text)  # drop $-words

        inf_dict['body_tags'] = self.BODY_TAG.findall(text)
        text = self.BODY_TAG.sub('', text)

        return CheckerForMeaning.find_trash(text, inf_dict, self.sense_lookup)


def build_records(lines, sense_lookup=None):
    """Return ``{'records': [...]}`` with one record per non-blank line."""
    return {
        'records': [
            TweetLineHandler(line, sense_lookup).explore()
            for line in lines
            if line.strip()
        ]
    }


def main(input_path='input.txt', output_path='output.json'):
    """Read *input_path*, analyse every line and write *output_path*."""
    lines = FileReader.read_file(input_path)
    JsonWriter.write_json(output_path, build_records(lines))


if __name__ == "__main__":
    main()