# Train a random-forest regressor that predicts `revenue` from the
# `sales_data` table, report its mean absolute error on a hold-out slice,
# and persist the fitted model to disk.
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sqlalchemy import create_engine

# Create a MySQL engine (the credentials below are placeholders).
engine = create_engine('mysql+pymysql://username:password@localhost/database_name')

# Load data into DataFrame
query = "SELECT * FROM sales_data LIMIT 20000;"
df = pd.read_sql(query, engine)

# Split the DataFrame: first 16000 rows for training, the remainder for testing.
df_train = df.iloc[:16000]
df_test = df.iloc[16000:]

# Define feature and target columns
# NOTE(review): if `region` or `product_id` is stored as text it has to be
# encoded before fitting -- confirm against the table schema.
features = ["product_id", "price", "quantity_sold", "discount", "region"]
target = "revenue"

X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]

# Train Random Forest Regressor. scikit-learn's ensemble module provides
# RandomForestRegressor; it has no class named RandomCompanyRegressor.
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predict and evaluate
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")

# Save the model (file name kept so existing loaders keep working).
joblib.dump(model, 'random_company_model.pkl')
# Response
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # scikit-learn has no RandomCompanyRegressor
from sklearn.metrics import mean_absolute_error
from sqlalchemy import create_engine

# Columns fed to the model, the column it predicts, and the subset that
# must be numeric before training.
_FEATURES = ["product_id", "price", "quantity_sold", "discount", "region"]
_TARGET = "revenue"
_NUMERIC_COLUMNS = ["price", "quantity_sold", "discount", "revenue"]

# The table name is interpolated into the SQL text (identifiers cannot be
# bound as query parameters), so only plain, optionally schema-qualified
# identifiers are accepted.
_TABLE_NAME_RE = re.compile(r"[A-Za-z0-9_$]+(\.[A-Za-z0-9_$]+)?")


def _is_positive_int(value):
    """Return True when *value* is an int (but not a bool) greater than zero."""
    return isinstance(value, int) and not isinstance(value, bool) and value > 0


def _encode_and_split(df, train_size):
    """One-hot encode ``region`` and split features/target at *train_size*.

    Encoding happens once on the whole frame, before the split, so the
    training and test matrices always share the same columns in the same
    order.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    encoded = pd.get_dummies(df[_FEATURES], columns=["region"], prefix="region")
    target = df[_TARGET]
    return (
        encoded.iloc[:train_size],
        encoded.iloc[train_size:],
        target.iloc[:train_size],
        target.iloc[train_size:],
    )


def train_and_evaluate_model(db_connection_string, table_name, limit=20000, train_size=16000):
    """Load data, train a RandomForestRegressor, and evaluate its performance.

    Args:
        db_connection_string: MySQL connection string
            (e.g., 'mysql+pymysql://username:password@localhost/database_name').
        table_name: Name of the table in the MySQL database. Must be a plain
            identifier, optionally qualified as ``schema.table``.
        limit: Number of rows to load from the database.
        train_size: Number of leading rows to use for training; the rows
            after it form the test set.

    Returns:
        A tuple ``(model, mae, df_results)``:
            - the trained RandomForestRegressor,
            - the mean absolute error on the test set,
            - a DataFrame with ``Actual`` and ``Predicted`` columns.
        Every failure (invalid arguments, database errors, bad data) is
        caught, reported with ``print``, and signalled by returning
        ``(None, None, None)``; no exception propagates to the caller.

    Side effects:
        Prints the mean absolute error and writes the fitted model to
        ``random_company_model.pkl`` in the current working directory.
    """
    engine = None
    try:
        # Input validation
        if not _is_positive_int(limit):
            raise ValueError("Limit must be a positive integer.")
        if not _is_positive_int(train_size) or train_size >= limit:
            raise ValueError("Train size must be a positive integer less than the limit.")
        if not isinstance(table_name, str) or not _TABLE_NAME_RE.fullmatch(table_name):
            raise ValueError("Table name must be a plain (optionally schema-qualified) identifier.")

        # Create a MySQL engine
        engine = create_engine(db_connection_string)

        # Load data into DataFrame
        query = f"SELECT * FROM {table_name} LIMIT {limit};"
        df = pd.read_sql(query, engine)

        # Check for necessary columns
        required_cols = [*_FEATURES, _TARGET]
        if not all(col in df.columns for col in required_cols):
            raise ValueError(f"DataFrame must contain columns: {required_cols}")

        # The table may hold fewer rows than requested; without this check
        # the test split would be empty and evaluation would fail obscurely.
        if len(df) <= train_size:
            raise ValueError(
                f"Query returned {len(df)} rows; more than train_size={train_size} are required."
            )

        # Handle potential non-numeric columns
        for col in _NUMERIC_COLUMNS:
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError) as exc:
                raise ValueError(
                    f"Column '{col}' contains non-numeric values. Please clean your data."
                ) from exc

        # One-hot encode 'region' (assumed categorical) and split.
        X_train, X_test, y_train, y_test = _encode_and_split(df, train_size)

        # Train Random Forest Regressor; random_state makes runs reproducible.
        model = RandomForestRegressor(random_state=42)
        model.fit(X_train, y_train)

        # Predict and evaluate
        predictions = model.predict(X_test)
        mae = mean_absolute_error(y_test, predictions)
        print(f"Mean Absolute Error: {mae}")

        # Create DataFrame with predictions and actual values
        df_results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

        # Save the model
        joblib.dump(model, 'random_company_model.pkl')

        return model, mae, df_results
    except Exception as e:
        # Deliberate top-level boundary: callers rely on the None-tuple
        # convention rather than on exceptions.
        print(f"An error occurred: {e}")
        return None, None, None
    finally:
        # Release pooled database connections whether or not training succeeded.
        if engine is not None:
            engine.dispose()


if __name__ == "__main__":
    # Example usage: Replace with your actual connection string and table name.
    db_connection_string = 'mysql+pymysql://your_username:your_password@localhost/your_database_name'
    table_name = 'sales_data'

    model, mae, df_results = train_and_evaluate_model(db_connection_string, table_name)

    if df_results is not None:
        print(df_results)