# 通过机器学习对招聘信息进行判断 **Repository Path**: liyouda/machine-learning ## Basic Information - **Project Name**: 通过机器学习对招聘信息进行判断 - **Description**: No description available - **Primary Language**: Python - **License**: Not specified - **Default Branch**: master - **Homepage**: None - **GVP Project**: No ## Statistics - **Stars**: 0 - **Forks**: 0 - **Created**: 2021-06-05 - **Last Updated**: 2021-06-05 ## Categories & Tags **Categories**: Uncategorized **Tags**: None ## README # 通过机器学习对招聘信息进行判断 #### #分析数据 训练集中共有14304个样本,每个样本有18个特征,特征具体介绍如下: job_id:工作ID号,每个工作都有唯一的ID号; title:招聘广告的标题信息; location:工作地点; department:招聘职位所属部门; salary_range:薪水范围; company_profile:公司简介; description:招聘的详细内容; requirements:入职要求; benefits:福利待遇; telecommuting:是否为远程办公; has_company_logo:公司是否有logo; has_questions:是否有筛选问题; employment_type:雇佣类型(正式、兼职、合同工等); required_experience:工作经验要求; required_education:学历要求; industry:所属行业; function:职位类别(工程师、销售等); fraudulent:是否为虚假招聘信息(是虚假信息则为1,反之为0,可作为标签)。 分析可得title,location,department,company_profile,description,requirements,benefits,function为文本特征,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education为离散特征。 #### #导入需要的库 import numpy as np import pandas as pd from IPython.display import display %matplotlib inline import seaborn as sns #### #导入并展示训练数据和测试数据 train = pd.read_csv('/home/aistudio/data/data66410/Train.csv') test = pd.read_csv('/home/aistudio/data/data66410/Test.csv') display(train.head(n=1),test.head(n=1)) #### #查看数据信息: train.info() test.info() train.describe() #### #采用seaborn绘图函数库作可视化分析 sns.countplot(x="telecommuting",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192601_0876e167_8897852.png "屏幕截图.png") sns.countplot(x="has_company_logo",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192609_6f16b5b6_8897852.png "屏幕截图.png") sns.countplot(x="has_questions",hue="fraudulent",data = train) 
![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192617_c7d41427_8897852.png "屏幕截图.png") sns.countplot(x="employment_type",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192624_0bf8322f_8897852.png "屏幕截图.png") sns.countplot(x="required_experience",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192633_37877ecc_8897852.png "屏幕截图.png") sns.countplot(x="required_education",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192645_2b3d9317_8897852.png "屏幕截图.png") sns.countplot(x="function",hue="fraudulent",data = train) ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192650_448ed52a_8897852.png "屏幕截图.png") 因为训练集中没有连续变量,故不采用小提琴图。 #### #删掉含有缺损值的样本 train.dropna(axis=0,inplace=True) #查看训练集的信息 train.info() 由上述seaborn绘图可知,除了function之前的那些特征,其他特征的数据都过于分散没有训练价值,故删除之。 #### #将训练数据分成标记和特征两部分 labels=train['fraudulent'] features = train.drop(['function','required_education','fraudulent','job_id','title','location','department','salary_range','company_profile','description','requirements','benefits','industry'],axis=1) #### #对所有特征实现独热编码 features = pd.get_dummies(features) encoded = list(features.columns) print("{} total features after one-hot encoding.".format(len(encoded))) #### #删除不需要的特征并进行独热编码 Id=test['job_id'] test = test.drop(['function','required_education','job_id','title','location','department','salary_range','company_profile','description','requirements','benefits','industry'],axis=1) test = pd.get_dummies(test) encoded = list(test.columns) print("{} total features after one-hot encoding.".format(len(encoded))) #### #引入需要的库和函数 from sklearn.model_selection import GridSearchCV from sklearn.metrics import make_scorer from sklearn.metrics import accuracy_score,roc_auc_score from time import time from sklearn.tree import DecisionTreeClassifier from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from 
sklearn.ensemble import AdaBoostClassifier from sklearn.neighbors import KNeighborsClassifier from xgboost.sklearn import XGBClassifier #### #定义通用函数框架 def fit_model(alg,parameters): X=features y=labels scorer = make_scorer(roc_auc_score) grid=GridSearchCV(alg,parameters,scoring=scorer,cv=5) start=time() grid=grid.fit(X,y) end=time() t=round(end-start,3) print(grid.best_params_) print ('searching time for {} is {} s'.format(alg.__class__.__name__,t)) return grid #### #列出SVM算法 alg1=SVC(probability=True,random_state=29) #由于使用roc_auc_score作为评分标准,需将SVC中的probability参数设置为True #### #列出需要调整的参数范围 parameters1 = {"C":range(1,20), "gamma": [0.05,0.1,0.15,0.2,0.25]} #### #开始调参 clf1=fit_model(alg1,parameters1) #### #定义保存函数 def save(clf,i): pred=clf.predict(test) sub=pd.DataFrame({ 'job_id': Id, 'fraudulent': pred }) sub.to_csv("res_tan_{}.csv".format(i), index=False) #### #调用保存函数 i=1 for clf in [clf1]: save(clf,i) i=i+1 #### #生成文件 ![输入图片说明](https://images.gitee.com/uploads/images/2021/0605/192936_77dc2bd0_8897852.png "屏幕截图.png")