From bca7f605495872051a6480ee0370c3a6e96014a5 Mon Sep 17 00:00:00 2001 From: yoution Date: Thu, 22 Jul 2021 10:25:36 +0800 Subject: [PATCH] fix: issue #370 mapping emsi tags --- README.md | 7 + package.json | 3 +- scripts/emsi-mapping/esmi-skills-mapping.js | 419 ++++++++++++++++++++ scripts/emsi-mapping/index.js | 52 +++ src/services/TeamService.js | 48 ++- 5 files changed, 503 insertions(+), 26 deletions(-) create mode 100644 scripts/emsi-mapping/esmi-skills-mapping.js create mode 100644 scripts/emsi-mapping/index.js diff --git a/README.md b/README.md index 287bf7cb..458b4d3a 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,8 @@ To be able to change and test `taas-es-processor` locally you can follow the nex | `npm run migrate` | Run any migration files which haven't run yet. | | `npm run migrate:undo` | Revert most recent migration. | | `npm run demo-payment-scheduler` | Create 1000 Work Periods Payment records in with status "scheduled" and various "amount" | +| `npm run emsi-mapping` | mapping EMSI tags to topcoder skills | + ## Import and Export data @@ -334,3 +336,8 @@ When we add, update or delete models and/or endpoints we have to make sure that - **DB Migration** - If there are any updates in DB schemas, create a DB migration script inside `migrations` folder which would make any necessary updates to the DB schema. - Test, that when we migrate DB from the previous state using `npm run migrate`, we get exactly the same DB schema as if we create DB from scratch using command `npm run init-db force`. 
+ +## EMSI mapping +Maps EMSI tags to Topcoder skills. +Run `npm run emsi-mapping` to create the mapping file. +It takes about 15 minutes to create the mapping file `scripts/emsi-mapping/emsi-skills-mapping.js`. diff --git a/package.json b/package.json index 3e2e7b18..17c1887b 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "init-db": "node src/init-db.js", "create-index": "node scripts/es/createIndex.js", "delete-index": "node scripts/es/deleteIndex.js", + "emsi-mapping": "node scripts/emsi-mapping/index.js", "index:all": "node scripts/es/reIndexAll.js", "index:jobs": "node scripts/es/reIndexJobs.js", "index:job-candidates": "node scripts/es/reIndexJobCandidates.js", @@ -87,4 +88,4 @@ "test/unit/**" ] } -} \ No newline at end of file +} diff --git a/scripts/emsi-mapping/esmi-skills-mapping.js b/scripts/emsi-mapping/esmi-skills-mapping.js new file mode 100644 index 00000000..4f070178 --- /dev/null +++ b/scripts/emsi-mapping/esmi-skills-mapping.js @@ -0,0 +1,419 @@ +module.exports = { + matchedSkills: { + Dropwizard: 'Dropwizard', + Nginx: 'NGINX', + 'Machine Learning': 'Machine Learning', + 'Force.Com': 'Force.Com Sites', + 'User Interface': 'UI Prototype', + Docker: 'Docker', + Appcelerator: 'appcelerator', + Flux: 'Flux', + 'Bootstrap (FRONT-END FRAMEWORK)': 'Twitter Bootstrap', + Financialforce: 'FinancialForce', + Redis: 'Redis', + Hybris: 'Hybris', + Splunk: 'Splunk', + 'Lua (SCRIPTING LANGUAGE)': 'Lua', + 'Jface (UI TOOLKIT)': 'Jface', + Recursion: 'Recursion', + Blackberry: 'Blackberry SDK', + Xul: 'XUL', + Mapreduce: 'MapReduce', + Nosql: 'NoSQL', + Linux: 'Linux', + Elasticsearch: 'Elasticsearch', + 'Microsoft Silverlight': 'Microsoft Silverlight', + Vertica: 'Vertica', + 'Windows Servers': 'Windows Server', + 'Haskell (PROGRAMMING LANGUAGE)': 'Haskell', + Hyperledger: 'Hyperledger', + 'Apache Cordova': 'Apache Cordova', + 'Play Framework': 'Play Framework', + Zipkin: 'Zipkin', + Marklogic: 'MarkLogic', + Mysql: 'MySql', + Visualforce: 
'Visualforce', + 'Data Architecture': 'IBM Rational Data Architect', + 'Windows Communication Foundation': 'Windows Communication Foundation', + 'Jboss Seam': 'JBoss Seam', + 'Java Stored Procedure (SQL)': 'Transact-SQL', + 'Component Object Model (COM)': 'COM', + 'Ubuntu (OPERATING SYSTEM)': 'ubuntu', + 'Cobol (PROGRAMMING LANGUAGE)': 'Cobol', + 'Continuous Integration': 'Continuous Integration', + 'Extensible Messaging And Presence Protocol (XMPP)': 'XMPP', + Microservices: 'Microservices', + 'Java Platform Micro Edition (J2ME)': 'J2ME', + 'Qt (SOFTWARE)': 'Qt', + 'R (PROGRAMMING LANGUAGE)': 'R', + 'Scala (PROGRAMMING LANGUAGE)': 'Scala', + 'Dynamic Programming': 'Dynamic Programming', + 'C (PROGRAMMING LANGUAGE)': 'C#', + Typescript: 'TypeScript', + Xamarin: 'Xamarin', + 'Sql Server Integration Services (SSIS)': 'SSIS', + Kubernetes: 'Kubernetes', + Inkscape: 'Inkscape', + 'Ibm Websphere Portal': 'IBM WebSphere Portal', + Matlab: 'Matlab', + Jekyll: 'Jekyll', + Cassandra: 'Cassandra', + 'Airplay Sdk (APPLE)': 'Apple HIG', + Jquery: 'jQuery Mobile', + 'Power Bi': 'Power BI', + Json: 'JSON', + 'Django (WEB FRAMEWORK)': 'Django', + 'Meteor.Js': 'Meteor.js', + Clojure: 'Clojure', + 'App Store (IOS)': 'iOS', + 'Amazon Alexa': 'Amazon Alexa', + 'Ibm Bluemix': 'IBM Bluemix', + 'Extensible Stylesheet Language (XSL)': 'XSL', + 'React.Js': 'React.js', + Gradle: 'Gradle', + Protractor: 'Protractor', + 'Java Platform Enterprise Edition (J2EE)': 'J2EE', + Drupal: 'Drupal', + 'Php (SCRIPTING LANGUAGE)': 'PHP', + 'Customer Experience': 'Customer Experience (Cx)', + Mariadb: 'MariaDB', + Grommet: 'Grommet', + Clickonce: 'ClickOnce', + 'Application Programming Interface (API)': 'API', + 'Unit Testing': 'Unit-Testing', + 'Ionic Framework': 'Ionic Framework', + Moodle: 'moodle', + Jbehave: 'JBehave', + Gremlin: 'Gremlin', + Office365: 'Office365', + 'Fortran (PROGRAMMING LANGUAGE)': 'Fortran', + 'Vue.Js': 'Vuejs', + 'Google Maps': 'Google-Maps', + 'Cloud Foundry': 'Cloud Foundry', 
+ 'Robot Framework': 'Robot Framework', + Ethereum: 'Ethereum', + Neo4J: 'Neo4J', + 'Microsoft Dynamics': 'Microsoft Dynamics', + 'Geospatial Information Technology (GIT)': 'Git', + Predix: 'Predix', + Gitlab: 'Gitlab', + 'Windows Workflow Foundation': 'Windows Workflow Foundation', + 'Javascript (PROGRAMMING LANGUAGE)': 'JavaScript', + 'Backbone.Js': 'Backbone.js', + Jabber: 'Jabber', + Wordpress: 'Wordpress', + Devops: 'DevOps', + 'Apache Derby': 'Apache Derby', + 'Rexx (PROGRAMMING LANGUAGE)': 'IBM REXX', + 'Web Scraping': 'Web scraping', + Sorting: 'Sorting', + 'Message Broker': 'IBM Websphere Message Broker', + Openam: 'Openam', + Less: 'Less', + 'Equinox (OSGI)': 'OSGi', + 'Zend Framework': 'zend framework', + 'Sketch (DESIGN SOFTWARE)': 'Sketch', + Coffeescript: 'Coffeescript', + 'Gnu Image Manipulation Program (GIMP)': 'gimp', + 'Node.Js': 'Node.js', + Laravel: 'laravel', + 'Ruby (PROGRAMMING LANGUAGE)': 'Ruby', + Mongodb: 'MongoDB', + 'Graphic Design': 'Graphic Design', + 'Entity Framework': 'Entity-Framework', + 'Hibernate (JAVA)': 'Hibernate', + 'Data Visualization': 'Data Visualization', + 'Windows Phone': 'Windows Phone', + 'Bash (SCRIPTING LANGUAGE)': 'Bash', + 'Akka (TOOLKIT)': 'Akka', + 'Sencha Touch': 'Sencha Touch 2', + Multithreading: 'Multithreading', + Apigee: 'Apigee', + 'Iso/Iec 14882 (C++)': 'C++', + 'Ab Initio (SOFTWARE)': 'Ab Initio', + 'Python (PROGRAMMING LANGUAGE)': 'Python', + 'Big Data': 'Big data', + Vscode: 'VSCode', + Codeigniter: 'Codeigniter', + 'Grunt.Js': 'Grunt.js', + 'Swing (DANCE)': 'Swing', + 'Groovy (PROGRAMMING LANGUAGE)': 'Groovy', + Openshift: 'OpenShift', + Integration: 'IBM Integration Bus', + Compression: 'Compression', + 'Salesforce.Com': 'Salesforce.com', + 'Ibm Websphere Mq': 'IBM WebSphere MQ', + 'Information Architecture': 'Information Architecture (IA)', + 'Ember.Js': 'Ember.js', + 'Vim (TEXT EDITOR)': 'vim', + Html5: 'HTML5', + 'Custom Tag': 'Custom Tag', + 'Asp.Net': 'ASP.NET', + 'Responsive Web Design': 
'Responsive Web Design', + 'Ibm Rational Software': 'IBM Rational Software Architect', + Corda: 'R3 Corda', + Phonegap: 'Phonegap', + Junit: 'Junit', + 'Graph Theory': 'Graph Theory', + 'Eclipse (SOFTWARE)': 'Eclipse', + Bigquery: 'BigQuery', + Requirejs: 'Require.js', + Flash: 'Flash', + Github: 'Github', + 'Cascading Style Sheets (CSS)': 'CSS', + 'Web Services': 'Web Services', + Phantomjs: 'Phantomjs', + Heroku: 'Heroku', + Geometry: 'Geometry', + 'Java Message Service (JMS)': 'JMS', + 'Aws Lambda': 'AWS Lambda', + Sass: 'SASS', + 'Artificial Intelligence': 'AI', + Talend: 'Talend', + Quorum: 'Quorum', + Kotlin: 'Kotlin', + 'Google Cloud': 'Google Cloud', + 'Interaction Design': 'Interaction Design (Ixd)', + Sqlite: 'Sqlite', + Postgresql: 'PostgreSQL', + 'User Experience': 'User Experience (Ux)', + Invision: 'InVision', + 'Vert.X': 'Vert.X', + Oauth: 'Oauth', + Smartsheet: 'Smartsheet', + Actionscript: 'ActionScript', + Drools: 'Drools', + 'Apache Kafka': 'Apache Kafka', + 'Perl (PROGRAMMING LANGUAGE)': 'Perl', + Parsing: 'String Parsing', + 'Product Design': 'Product Design', + Openstack: 'Openstack', + 'Android (OPERATING SYSTEM)': 'Android', + 'Google App Engines': 'Google App Engine', + 'Apache Camel': 'Apache Camel', + 'Java (PROGRAMMING LANGUAGE)': 'Java', + 'Application Servers': 'IBM Websphere Application Server', + 'Hypertext Markup Language (HTML)': 'HTML', + 'Sitemaps (XML)': 'XML', + Clojurescript: 'ClojureScript', + Blockchain: 'Blockchain', + Cartodb: 'CartoDB', + 'Oracle Databases': 'Oracle Database', + 'Ibm Lotus Domino': 'IBM Lotus Domino', + Indexeddb: 'IndexedDB', + 'Data Science': 'Data Science', + 'Ajax (PROGRAMMING LANGUAGE)': 'Ajax', + Twilio: 'Twilio', + Selenium: 'Selenium', + Trello: 'trello', + Appium: 'Appium', + Jruby: 'Jruby', + 'Ibm Db2': 'IBM DB2', + Branding: 'Branding', + '3D Reconstruction': '3D Reconstruction', + 'Ibm Aix': 'IBM AiX', + 'Active Directory': 'Active Directory' + }, + unMatchedSkills: [ + 'EJB', + 'Database', + 
'Winforms', + 'Photoshop', + '.NET', + 'Leaflet.js', + 'Databasedotcom', + 'Maven', + 'Gaming', + 'Go', + 'Mobile', + 'IBM WebSphere DataStage', + 'Azure', + 'Om', + 'Lightning', + 'File', + 'Security', + 'Tableau', + 'Ibatis/Mybatis', + 'Integrator', + 'HAML', + 'SFDC Apex', + 'Responsive Design', + 'Castor', + 'Npm', + 'ipfs', + '.NET System.Addins', + 'TIBCO', + 'Boomi', + 'InDesign', + 'EC2', + 'Concept Design', + 'nodewebkit', + 'S3', + 'Mozilla', + 'sympfony', + 'Website Design', + 'Chatter', + 'Calabash', + 'Sinatra', + 'Algorithm', + 'OSx', + 'Open Source', + 'Frontend', + 'XAML', + 'VB', + 'Winforms Controls', + 'User Testing', + 'SFDC Lightening Components', + 'Forms', + 'Contentful', + 'bower', + 'Use Case Diagrams (TcUML)', + 'BizTalk', + 'Infographic', + 'Gulp', + 'Xcode', + 'Word/Rich Text', + 'Spring', + 'RMI', + 'OmniGraffle', + 'Linq', + 'Swift', + 'MESH01', + 'MSMQ', + 'yii', + 'IBM Rational Application Developer', + 'Illustrator', + 'QlikView', + 'MIDP 2.0', + 'Beanstalk', + 'JPA', + 'SWT', + 'Simulation', + 'Brute Force', + 'IBM Pl/1', + 'Cumulocity', + 'Windows', + 'IBM Cognitive', + 'Validation', + 'IDOL OnDemand', + 'Wpf', + 'Hadoop', + 'Search', + 'Actian Database', + 'Simple Math', + 'Box', + 'CSS3', + 'LoadRunner', + 'Sharepoint 3.0', + 'IBM COGNOS', + 'Dc.js', + 'Pl/Sql', + 'Cisco', + 'Web methods', + 'Aris', + 'Remoting', + 'Apex', + 'VB.NET', + 'PowerShell', + 'Q & Bluebird', + 'Microsoft Exchange', + 'Swagger', + 'Regex', + 'UML', + 'JSF', + 'WCF', + 'Zepto.js', + 'Flight.js', + 'Apache Flume', + 'IBM Cloud Private', + 'Activity Diagrams (Tcuml)', + 'Servlet', + 'Cocoa', + 'Greedy', + 'IBM Rational Team Concert', + 'DocuSign', + 'VBA', + 'AngularJS', + 'Mobile Design', + 'Actian Data', + 'doctrine', + 'JSP', + 'foundation', + 'Axure', + 'Knockout', + 'F#', + 'IBM Watson', + 'Excel', + 'Sockets', + 'Siebel', + 'QA', + 'UITableView', + 'Dynamodb', + 'Solidity', + 'Logo', + 'travis', + 'Visual-Studio', + 'Espruino', + 'REST', + 
'Hashgraph', + 'tvOS', + 'atom', + 'Titanium', + 'Shell', + 'Tosca', + 'Ldap', + 'kraken.js', + 'Performance', + 'JDBC', + 'D3.JS', + 'Couchbase', + 'CloudFactor', + 'HTTP', + 'ADO.NET', + 'Dojo', + 'Applet', + 'Spark', + 'AWS', + 'Mainframe', + 'Facebook', + 'jetbrains', + 'Flex', + 'Ant', + 'SFDC Mobile', + 'HPE Haven OnDemand', + 'Oracle', + 'JavaBean', + 'Salesforce', + 'Struts', + 'Function', + 'Class', + 'IBM Lotus Notes', + 'SCSS', + 'Brivo Labs', + 'SAP', + 'Multichain', + 'List', + 'Express', + 'gulp', + 'JMeter', + 'Math', + 'Image', + 'Commerce Server 2009', + 'IBM Design', + 'Print', + 'Advanced Math', + 'SFDC REST APIs', + 'String Manipulation', + 'chrome', + 'String', + 'SFDC Design', + 'CA', + 'Oracle EBS', + 'Golang', + 'Simple Search', + 'Pega', + 'Cognitive', + 'redhat', + 'Marvel - Design' + ] +} diff --git a/scripts/emsi-mapping/index.js b/scripts/emsi-mapping/index.js new file mode 100644 index 00000000..58245fd4 --- /dev/null +++ b/scripts/emsi-mapping/index.js @@ -0,0 +1,52 @@ +/** + * mapping emsi skills to topcoder skills + */ + +const fs = require('fs') +const path = require('path') +const logger = require('../../src/common/logger') +const helper = require('../../src/common/helper') + +async function mappingSkill () { + const matchedSkills = {} + const unMatchedSkills = [] + const failedSkills = [] + let tcSkills + const startTime = Date.now() + try { + tcSkills = await helper.getAllTopcoderSkills() + } catch (e) { + logger.error({ component: 'getAllTopcoderSkills', context: 'emsi-mapping', message: JSON.stringify(e) }) + } + + for (let i = 0; i < tcSkills.length; i++) { + const tcSkill = tcSkills[i] + let emsiTags + try { + emsiTags = await helper.getTags(tcSkill.name) + } catch (e) { + failedSkills.push(tcSkill.name) + logger.error({ component: 'getTags', context: 'emsi-mapping', message: JSON.stringify(e) }) + } + if (emsiTags.length) { + matchedSkills[emsiTags[0].tag] = tcSkill.name + } else { + unMatchedSkills.push(tcSkill.name) + } + 
} + + const textString = `module.exports = { matchedSkills: ${JSON.stringify(matchedSkills, 2, 3)}, unMatchedSkills: ${JSON.stringify(unMatchedSkills, 2, 2)} }` + const filePath = path.join(__dirname, 'emsi-skills-mapping.js') + const result = { + totalTime: (Date.now() - startTime) / 60 / 1000 + ' min', + totalSkills: tcSkills.length, + matchedSkills: tcSkills.length - unMatchedSkills.length, + unMatchedSkills: unMatchedSkills.length, + filePath, + failSkills: failedSkills + } + + logger.info({ component: 'emsi-mapping', context: 'emsi-mapping', message: JSON.stringify(result) }) + fs.writeFileSync(filePath, textString) +} +mappingSkill() diff --git a/src/services/TeamService.js b/src/services/TeamService.js index 375f5c2f..84b43d1f 100644 --- a/src/services/TeamService.js +++ b/src/services/TeamService.js @@ -17,9 +17,9 @@ const { Op } = require('sequelize') const models = require('../models') const stopWords = require('../../data/stopWords.json') const { getAuditM2Muser } = require('../common/helper') +const { matchedSkills, unMatchedSkills } = require('../../scripts/emsi-mapping/esmi-skills-mapping') const Role = models.Role const RoleSearchRequest = models.RoleSearchRequest -const topcoderSkills = {} const emailTemplates = _.mapValues(emailTemplateConfig, (template) => { return { @@ -64,29 +64,19 @@ async function _getJobsByProjectIds (currentUser, projectIds) { } /** - * Gets topcoder skills and stores their name and compiled - * regex patters according to Levenshtein distance <=1 + * Compiles regex patterns according to Levenshtein distance <=1 for the skills that have no EMSI mapping + * @returns {Array} the unmatched skills, each with its name and compiled regex pattern */ -async function _reloadCachedTopcoderSkills () { - // do not reload if cache time is not expired - if (!_.isUndefined(topcoderSkills.time)) { - const cacheTime = config.TOPCODER_SKILLS_CACHE_TIME * 60 * 1000 - if (new Date().getTime() - topcoderSkills.time < cacheTime) { - return - } - } - // collect all skills - const skills 
= await helper.getAllTopcoderSkills() - // set the last cached time - topcoderSkills.time = new Date().getTime() - topcoderSkills.skills = [] +function compileRegexPatternForNoEmsiSkills () { + const unMatched = [] // store skill names and compiled regex paterns - _.each(skills, skill => { - topcoderSkills.skills.push({ - name: skill.name, - pattern: _compileRegexPatternForSkillName(skill.name) + _.each(unMatchedSkills, skill => { + unMatched.push({ + name: skill, + pattern: _compileRegexPatternForSkillName(skill) }) }) + return unMatched } /** @@ -835,18 +825,17 @@ getRoleBySkills.schema = Joi.object() }).required() /** - * Return skills by job description. + * Return skills by job description from EMSI. * * @param {Object} currentUser the user who perform this operation. * @param {Object} data the search criteria * @returns {Object} the result */ async function getSkillsByJobDescription (data) { - // load topcoder skills if needed. Using cached skills helps to avoid - // unnecessary api calls which is extremely time comsuming. 
- await _reloadCachedTopcoderSkills() // replace markdown tags with spaces const description = helper.removeTextFormatting(data.description) + // get skill tags for the description from EMSI + const emsiTags = await helper.getTags(description) // extract words from description let words = _.split(description, ' ') // remove stopwords from description @@ -858,11 +847,20 @@ async function getSkillsByJobDescription (data) { } words = _.concat(words, twoWords) let foundSkills = [] + // add the skills mapped from the EMSI tags + _.each(emsiTags, (t) => { + if (matchedSkills[t.tag]) { + foundSkills.push(matchedSkills[t.tag]) + } + }) + + // unmatched skills (those with no EMSI tag mapping) + const unMatchedTopcoderSkills = compileRegexPatternForNoEmsiSkills() const result = [] // try to match each word with skill names // using pre-compiled regex pattern _.each(words, word => { - _.each(topcoderSkills.skills, skill => { + _.each(unMatchedTopcoderSkills, skill => { // do not stop searching after a match in order to detect more lookalikes if (skill.pattern.test(word)) { foundSkills.push(skill.name)