"""Preprocess a tabular dataset and balance the training classes with SMOTE-NC.

Steps performed at module level:

1. Load ``my_dataset.csv`` and drop duplicate rows.
2. Split features/target into stratified train and test sets (70/30).
3. Impute + scale numeric columns and impute + one-hot encode categorical
   columns with a ``ColumnTransformer`` fitted on the training set only.
4. Oversample the minority class(es) of the training set with SMOTE-NC and
   one-hot encode the resampled rows with the already-fitted encoder.

Columns are selected by name: numeric columns contain ``'num_feature'`` and
categorical columns contain ``'cat_feature'``; any other feature column is
dropped by the ``ColumnTransformer`` (its default ``remainder='drop'``).
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC

df = pd.read_csv('my_dataset.csv')

# Remove duplicate rows from the dataset
df.drop_duplicates(inplace=True)

# Separate the features (X) and the target variable (y)
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and test sets; stratify keeps the class
# proportions of y the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Check class distribution
# NOTE(review): np.bincount assumes the target holds non-negative integers.
print(f"Original training set class distribution: {np.bincount(y_train)}")

# Identify numeric and categorical features by column-name substring
numeric_features = [col for col in X_train.columns if 'num_feature' in col]
categorical_features = [col for col in X_train.columns if 'cat_feature' in col]

# Preprocessing pipeline for numeric missing values and feature scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Preprocessing pipeline for categorical missing values and encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing: fitted on the training split only to avoid test-set leakage
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Class balancing
#
# SMOTE-NC needs every categorical feature as ONE un-encoded column and the
# indices of those columns in the matrix it is fitted on. The previous code
# passed indices taken from the raw X_train layout while fitting on the
# one-hot encoded matrix, so the indices did not match the data. Here the
# resampling input is built from the fitted sub-transformers instead: scaled
# numeric columns first, then imputed (not yet encoded) categorical columns.
fitted_numeric = preprocessor.named_transformers_['num']
fitted_categorical = preprocessor.named_transformers_['cat']
onehot_encoder = fitted_categorical.named_steps['onehot']

X_train_numeric = fitted_numeric.transform(X_train[numeric_features])
X_train_categorical = fitted_categorical.named_steps['imputer'].transform(
    X_train[categorical_features]
)

# Object dtype lets numeric values and category labels share one array.
X_train_smote_input = np.hstack([
    np.asarray(X_train_numeric).astype(object),
    np.asarray(X_train_categorical).astype(object),
])
n_numeric = len(numeric_features)
categorical_indices = list(range(n_numeric, X_train_smote_input.shape[1]))

smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_train_balanced, y_train_resampled = smote_nc.fit_resample(
    X_train_smote_input, y_train
)
X_train_balanced = np.asarray(X_train_balanced)

# Encode the resampled categoricals with the encoder fitted above so the
# column layout matches X_train_processed / X_test_processed
# (numeric block followed by the one-hot block).
balanced_onehot = onehot_encoder.transform(X_train_balanced[:, n_numeric:])
if hasattr(balanced_onehot, 'toarray'):
    # OneHotEncoder returns a sparse matrix by default; densify for hstack.
    balanced_onehot = balanced_onehot.toarray()
X_train_resampled = np.hstack([
    X_train_balanced[:, :n_numeric].astype(float),
    balanced_onehot,
])

print(f"Resampled training set class distribution: {np.bincount(y_train_resampled)}")
[[["易于理解","easyToUnderstand","thumb-up"],["解决了我的问题","solvedMyProblem","thumb-up"],["其他","otherUp","thumb-up"]],[["Hard to understand","hardToUnderstand","thumb-down"],["Incorrect information or sample code","incorrectInformationOrSampleCode","thumb-down"],["Missing the information/samples I need","missingTheInformationSamplesINeed","thumb-down"],["翻译问题","translationIssue","thumb-down"],["其他","otherDown","thumb-down"]],["最后更新时间 (UTC):2024-11-26。"],[],[]]