Unverified
comandsv2
import json
import joblib
from datasets import Dataset
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, auc
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from modeling.ClasicModels.performance_utils import plot_roc_curve, plot_pr_curve, plt_show, get_model_performance, \
    get_threshold, evaluate_new, print_statistics


def conditional_merge(base_df, merge_df, merge_flag, merge_col='PATIENT_UNIQUE_NUMBER'):
    """Left-join ``merge_df`` onto ``base_df`` when ``merge_flag`` is True.

    Joins ``base_df['id']`` against ``merge_df[merge_col]`` and drops the
    redundant key column afterwards.  Returns ``base_df`` unchanged when the
    flag is False (so disabled observation sources are a no-op).
    """
    if merge_flag:
        base_df = pd.merge(base_df, merge_df, left_on='id', right_on=merge_col, how='left')
        base_df.drop(columns=[merge_col], inplace=True)
    return base_df


def read_observation_data_conditionally(file_tuple, path, flag):
    """Read one observation CSV and normalise its key column to str.

    ``file_tuple`` is ``(filename, key_column_name)``.  Returns the DataFrame,
    or None when ``flag`` is False so the source is skipped without touching
    disk.
    """
    if flag:
        data = pd.read_csv(path + file_tuple[0], low_memory=False)
        # Keys must be str on both sides of the later merge.
        data[file_tuple[1]] = data[file_tuple[1]].astype(str)
        return data
    return None


# Flags to enable/disable merging of specific dataframes
observation_data_flag = True
pharmacy_observations_flag = True
radiology_observations_flag = True
vital_observations_flag = True
speciality_and_facilitiy_observations_flag = True
visit_observations_flag = True
lab_observations_flag = True

prediction_mode = True
train_mode = False
model_id = 'd'

# Define file and key-column tuples based on train_mode flag.
# NOTE(review): the two datasets use different key-column capitalisation
# ('PATIENT_UNIQUE_NUMBER' / 'patient_unique_number' / 'patient_id') — these
# must match the CSV headers exactly; confirm before renaming.
if train_mode:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_v4/"
    patient_data = ('p_data_d_v2.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('Pharmacy_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    radiology_observations = ('Radiology_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    vital_observations = ('Vital_observations.csv', 'patient_unique_number')
    speciality_and_facility_observations = ('speciality_and_facility_codes_observations.csv', 'patient_unique_number')
    visit_observations = ('reason_for_visit_observations.csv', 'patient_unique_number')
    lab_observations = ('LAB_INFO_observations.csv', 'patient_unique_number')
else:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_2023/"
    patient_data = ('p_data_pred_d.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations_2023.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('pharmacy_observations_2023.csv', 'patient_id')
    radiology_observations = ('radiology_observations_2023.csv', 'patient_id')
    vital_observations = ('vital_observations_2023.csv', 'patient_id')
    speciality_and_facility_observations = ('facility_speciality_observations_2023.csv', 'patient_id')
    visit_observations = ('visit_reason_observations_2023.csv', 'patient_id')
    lab_observations = ('lab_observations_2023.csv', 'patient_id')

# Read the main patient dataframe.
df = pd.read_csv(observation_path + patient_data[0], low_memory=False)
df['PATIENT_UNIQUE_NUMBER'] = df[patient_data[1]].astype(str)
# Training data carries (id, label); prediction data carries ids only.
df.columns = ['id', 'label'] if len(df.columns) == 2 else ['id']

# Read each observation source (None when its flag is off).
observation_data_df = read_observation_data_conditionally(observation_data, observation_path, observation_data_flag)
pharmacy_observations_df = read_observation_data_conditionally(pharmacy_observations, observation_path, pharmacy_observations_flag)
radiology_observations_df = read_observation_data_conditionally(radiology_observations, observation_path, radiology_observations_flag)
vital_observations_df = read_observation_data_conditionally(vital_observations, observation_path, vital_observations_flag)
speciality_and_facility_observations_df = read_observation_data_conditionally(speciality_and_facility_observations, observation_path, speciality_and_facilitiy_observations_flag)
visit_observations_df = read_observation_data_conditionally(visit_observations, observation_path, visit_observations_flag)
lab_observations_df = read_observation_data_conditionally(lab_observations, observation_path, lab_observations_flag)

if train_mode:
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
else:
    # Prediction mode: everything is "test"; keep an empty train frame so the
    # merge/statistics code below still runs unchanged.
    test_df = df
    train_df = pd.DataFrame(columns=df.columns)

train_df['id'] = train_df['id'].str.strip()
test_df['id'] = test_df['id'].str.strip()

take_sample = False
if take_sample:
    # Downsample negatives to a 1:1 class ratio for training.
    true_rows = train_df[train_df['label'] == 1]
    false_rows = train_df[train_df['label'] == 0]
    sample_size = min(len(true_rows), len(false_rows))
    false_sample = false_rows.sample(n=sample_size, random_state=42)
    train_df = pd.concat([true_rows, false_sample])
# test_df = test_df.sample(frac = 0.1)

print('train_df.shape', train_df.shape)
train_df = conditional_merge(train_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
train_df = conditional_merge(train_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
train_df = conditional_merge(train_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
train_df = conditional_merge(train_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
train_df = conditional_merge(train_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
train_df = conditional_merge(train_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
train_df = conditional_merge(train_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])
print('train_df.shape', train_df.shape)

test_df = conditional_merge(test_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
test_df = conditional_merge(test_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
test_df = conditional_merge(test_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
test_df = conditional_merge(test_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
test_df = conditional_merge(test_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
test_df = conditional_merge(test_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
test_df = conditional_merge(test_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])

print('statistics for train')
print_statistics(train_df)
print('statistics for test')
print_statistics(test_df)

train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(test_df)


def preprocess_function(batch):
    """Concatenate the enabled observation columns into one 'text' field per row.

    Runs in batched mode under ``Dataset.map``: every value in ``batch`` is a
    column-aligned list.  Missing/falsy cells become "" so every row keeps the
    same number of '\\n'-separated segments.

    NOTE(review): 'pharamacy_observations' and 'lab_info_ovservations' look
    misspelled but must match the upstream CSV headers — confirm against the
    data before renaming.
    """
    observation_data_columns = ['PRINICIPAL_DIAGNOSIS', 'SECONDARY_DIAGNOSIS', 'AGE_CATEGORY',
                                'MARITAL_STATUS', 'GENDER', 'BLOOD_GROUP']
    text_parts = []
    if observation_data_flag:
        text_parts.extend([batch[col] for col in observation_data_columns])
    if pharmacy_observations_flag:
        text_parts.append(batch['pharamacy_observations'])
    if radiology_observations_flag:
        text_parts.append(batch['radiology_observations'])
    if speciality_and_facilitiy_observations_flag:
        text_parts.append(batch['facility_and_specialty_codes_observations'])
    if vital_observations_flag:
        text_parts.append(batch['vital_observations'])
    if visit_observations_flag:
        text_parts.append(batch['reason_for_visit'])
    if lab_observations_flag:
        text_parts.append(batch['lab_info_ovservations'])
    batch['text'] = ['\n'.join([str(part) if part else "" for part in parts]) for parts in zip(*text_parts)]
    return batch


train_data = train_data.map(preprocess_function, batched=True, batch_size=1000)
eval_data = eval_data.map(preprocess_function, batched=True, batch_size=1000)

# Fit the TF-IDF vocabulary only in train mode; otherwise reuse the persisted one.
if train_mode:
    vectorizer = TfidfVectorizer()
    patient_history_train = vectorizer.fit_transform(train_data['text'])
    joblib.dump(vectorizer, f'vectorizer_{model_id}.pkl')
else:
    vectorizer = joblib.load(f'vectorizer_{model_id}.pkl')

patient_history_test = vectorizer.transform(eval_data['text'])

if train_mode:
    print('start training')
    model = LogisticRegression(class_weight='balanced', C=10)
    model.fit(patient_history_train, train_df['label'])
    print('end training')
    joblib.dump(model, f'model_{model_id}.pkl')
else:
    model = joblib.load(f'model_{model_id}.pkl')

predictions = model.predict_proba(patient_history_test)
print('end predictions')
predictions = predictions[:, 1]  # probability of the positive class

#################################
if prediction_mode:
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_pred': predictions})
    df_eval.to_csv(f'predictions_{model_id}.csv')

# BUGFIX: everything below needs ground-truth labels and the fitted training
# matrix (`test_df['label']`, `train_df['label']`, `patient_history_train`),
# none of which exist when train_mode=False — so the whole evaluation and
# calibration section is guarded by train_mode to avoid a NameError/KeyError
# in prediction mode.
if train_mode:
    cm = confusion_matrix(test_df['label'], np.round(predictions))
    print("Confusion Matrix:")
    print(cm)

    ###############################################
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_truth': test_df['label'], 'y_pred': predictions})
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='y_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    ############ CalibratedClassifierCV #####################
    print('CalibratedClassifier .. ')
    # cv='prefit' wraps the already-fitted model; isotonic regression remaps
    # its scores to calibrated probabilities on the training data.
    calibrated_clf = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
    calibrated_clf.fit(patient_history_train, train_df['label'])

    # Predictions and calibration curve
    print('Predictions and calibration curve')
    y_pred = calibrated_clf.predict_proba(patient_history_test)[:, 1]
    fraction_of_positives, mean_predicted_value = calibration_curve(test_df['label'], y_pred, n_bins=10)
    df_eval['calibrated_pred'] = y_pred
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='calibrated_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    # Calibration slope/intercept: linear fit of observed positive fraction
    # against mean predicted probability (ideal: slope 1, intercept 0).
    mean_predicted_value = mean_predicted_value.reshape(-1, 1)
    reg = LinearRegression().fit(mean_predicted_value, fraction_of_positives)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    print(f"Calibration Slope: {slope}")
    print(f"Calibration Intercept: {intercept}")
    df_eval.to_csv(f'valid_predictions_{model_id}.csv')
Python
Voted: 0 by 0 user(s)
import json
import joblib
from datasets import Dataset
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, auc
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from modeling.ClasicModels.performance_utils import plot_roc_curve, plot_pr_curve, plt_show, get_model_performance, \
    get_threshold, evaluate_new, print_statistics


def conditional_merge(base_df, merge_df, merge_flag, merge_col='PATIENT_UNIQUE_NUMBER'):
    """Left-join ``merge_df`` onto ``base_df`` when ``merge_flag`` is True.

    Joins ``base_df['id']`` against ``merge_df[merge_col]`` and drops the
    redundant key column afterwards.  Returns ``base_df`` unchanged when the
    flag is False (so disabled observation sources are a no-op).
    """
    if merge_flag:
        base_df = pd.merge(base_df, merge_df, left_on='id', right_on=merge_col, how='left')
        base_df.drop(columns=[merge_col], inplace=True)
    return base_df


def read_observation_data_conditionally(file_tuple, path, flag):
    """Read one observation CSV and normalise its key column to str.

    ``file_tuple`` is ``(filename, key_column_name)``.  Returns the DataFrame,
    or None when ``flag`` is False so the source is skipped without touching
    disk.
    """
    if flag:
        data = pd.read_csv(path + file_tuple[0], low_memory=False)
        # Keys must be str on both sides of the later merge.
        data[file_tuple[1]] = data[file_tuple[1]].astype(str)
        return data
    return None


# Flags to enable/disable merging of specific dataframes
observation_data_flag = True
pharmacy_observations_flag = True
radiology_observations_flag = True
vital_observations_flag = True
speciality_and_facilitiy_observations_flag = True
visit_observations_flag = True
lab_observations_flag = True

prediction_mode = True
train_mode = False
model_id = 'd'

# Define file and key-column tuples based on train_mode flag.
# NOTE(review): the two datasets use different key-column capitalisation
# ('PATIENT_UNIQUE_NUMBER' / 'patient_unique_number' / 'patient_id') — these
# must match the CSV headers exactly; confirm before renaming.
if train_mode:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_v4/"
    patient_data = ('p_data_d_v2.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('Pharmacy_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    radiology_observations = ('Radiology_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    vital_observations = ('Vital_observations.csv', 'patient_unique_number')
    speciality_and_facility_observations = ('speciality_and_facility_codes_observations.csv', 'patient_unique_number')
    visit_observations = ('reason_for_visit_observations.csv', 'patient_unique_number')
    lab_observations = ('LAB_INFO_observations.csv', 'patient_unique_number')
else:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_2023/"
    patient_data = ('p_data_pred_d.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations_2023.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('pharmacy_observations_2023.csv', 'patient_id')
    radiology_observations = ('radiology_observations_2023.csv', 'patient_id')
    vital_observations = ('vital_observations_2023.csv', 'patient_id')
    speciality_and_facility_observations = ('facility_speciality_observations_2023.csv', 'patient_id')
    visit_observations = ('visit_reason_observations_2023.csv', 'patient_id')
    lab_observations = ('lab_observations_2023.csv', 'patient_id')

# Read the main patient dataframe.
df = pd.read_csv(observation_path + patient_data[0], low_memory=False)
df['PATIENT_UNIQUE_NUMBER'] = df[patient_data[1]].astype(str)
# Training data carries (id, label); prediction data carries ids only.
df.columns = ['id', 'label'] if len(df.columns) == 2 else ['id']

# Read each observation source (None when its flag is off).
observation_data_df = read_observation_data_conditionally(observation_data, observation_path, observation_data_flag)
pharmacy_observations_df = read_observation_data_conditionally(pharmacy_observations, observation_path, pharmacy_observations_flag)
radiology_observations_df = read_observation_data_conditionally(radiology_observations, observation_path, radiology_observations_flag)
vital_observations_df = read_observation_data_conditionally(vital_observations, observation_path, vital_observations_flag)
speciality_and_facility_observations_df = read_observation_data_conditionally(speciality_and_facility_observations, observation_path, speciality_and_facilitiy_observations_flag)
visit_observations_df = read_observation_data_conditionally(visit_observations, observation_path, visit_observations_flag)
lab_observations_df = read_observation_data_conditionally(lab_observations, observation_path, lab_observations_flag)

if train_mode:
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
else:
    # Prediction mode: everything is "test"; keep an empty train frame so the
    # merge/statistics code below still runs unchanged.
    test_df = df
    train_df = pd.DataFrame(columns=df.columns)

train_df['id'] = train_df['id'].str.strip()
test_df['id'] = test_df['id'].str.strip()

take_sample = False
if take_sample:
    # Downsample negatives to a 1:1 class ratio for training.
    true_rows = train_df[train_df['label'] == 1]
    false_rows = train_df[train_df['label'] == 0]
    sample_size = min(len(true_rows), len(false_rows))
    false_sample = false_rows.sample(n=sample_size, random_state=42)
    train_df = pd.concat([true_rows, false_sample])
# test_df = test_df.sample(frac = 0.1)

print('train_df.shape', train_df.shape)
train_df = conditional_merge(train_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
train_df = conditional_merge(train_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
train_df = conditional_merge(train_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
train_df = conditional_merge(train_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
train_df = conditional_merge(train_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
train_df = conditional_merge(train_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
train_df = conditional_merge(train_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])
print('train_df.shape', train_df.shape)

test_df = conditional_merge(test_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
test_df = conditional_merge(test_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
test_df = conditional_merge(test_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
test_df = conditional_merge(test_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
test_df = conditional_merge(test_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
test_df = conditional_merge(test_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
test_df = conditional_merge(test_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])

print('statistics for train')
print_statistics(train_df)
print('statistics for test')
print_statistics(test_df)

train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(test_df)


def preprocess_function(batch):
    """Concatenate the enabled observation columns into one 'text' field per row.

    Runs in batched mode under ``Dataset.map``: every value in ``batch`` is a
    column-aligned list.  Missing/falsy cells become "" so every row keeps the
    same number of '\\n'-separated segments.

    NOTE(review): 'pharamacy_observations' and 'lab_info_ovservations' look
    misspelled but must match the upstream CSV headers — confirm against the
    data before renaming.
    """
    observation_data_columns = ['PRINICIPAL_DIAGNOSIS', 'SECONDARY_DIAGNOSIS', 'AGE_CATEGORY',
                                'MARITAL_STATUS', 'GENDER', 'BLOOD_GROUP']
    text_parts = []
    if observation_data_flag:
        text_parts.extend([batch[col] for col in observation_data_columns])
    if pharmacy_observations_flag:
        text_parts.append(batch['pharamacy_observations'])
    if radiology_observations_flag:
        text_parts.append(batch['radiology_observations'])
    if speciality_and_facilitiy_observations_flag:
        text_parts.append(batch['facility_and_specialty_codes_observations'])
    if vital_observations_flag:
        text_parts.append(batch['vital_observations'])
    if visit_observations_flag:
        text_parts.append(batch['reason_for_visit'])
    if lab_observations_flag:
        text_parts.append(batch['lab_info_ovservations'])
    batch['text'] = ['\n'.join([str(part) if part else "" for part in parts]) for parts in zip(*text_parts)]
    return batch


train_data = train_data.map(preprocess_function, batched=True, batch_size=1000)
eval_data = eval_data.map(preprocess_function, batched=True, batch_size=1000)

# Fit the TF-IDF vocabulary only in train mode; otherwise reuse the persisted one.
if train_mode:
    vectorizer = TfidfVectorizer()
    patient_history_train = vectorizer.fit_transform(train_data['text'])
    joblib.dump(vectorizer, f'vectorizer_{model_id}.pkl')
else:
    vectorizer = joblib.load(f'vectorizer_{model_id}.pkl')

patient_history_test = vectorizer.transform(eval_data['text'])

if train_mode:
    print('start training')
    model = LogisticRegression(class_weight='balanced', C=10)
    model.fit(patient_history_train, train_df['label'])
    print('end training')
    joblib.dump(model, f'model_{model_id}.pkl')
else:
    model = joblib.load(f'model_{model_id}.pkl')

predictions = model.predict_proba(patient_history_test)
print('end predictions')
predictions = predictions[:, 1]  # probability of the positive class

#################################
if prediction_mode:
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_pred': predictions})
    df_eval.to_csv(f'predictions_{model_id}.csv')

# BUGFIX: everything below needs ground-truth labels and the fitted training
# matrix (`test_df['label']`, `train_df['label']`, `patient_history_train`),
# none of which exist when train_mode=False — so the whole evaluation and
# calibration section is guarded by train_mode to avoid a NameError/KeyError
# in prediction mode.
if train_mode:
    cm = confusion_matrix(test_df['label'], np.round(predictions))
    print("Confusion Matrix:")
    print(cm)

    ###############################################
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_truth': test_df['label'], 'y_pred': predictions})
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='y_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    ############ CalibratedClassifierCV #####################
    print('CalibratedClassifier .. ')
    # cv='prefit' wraps the already-fitted model; isotonic regression remaps
    # its scores to calibrated probabilities on the training data.
    calibrated_clf = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
    calibrated_clf.fit(patient_history_train, train_df['label'])

    # Predictions and calibration curve
    print('Predictions and calibration curve')
    y_pred = calibrated_clf.predict_proba(patient_history_test)[:, 1]
    fraction_of_positives, mean_predicted_value = calibration_curve(test_df['label'], y_pred, n_bins=10)
    df_eval['calibrated_pred'] = y_pred
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='calibrated_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    # Calibration slope/intercept: linear fit of observed positive fraction
    # against mean predicted probability (ideal: slope 1, intercept 0).
    mean_predicted_value = mean_predicted_value.reshape(-1, 1)
    reg = LinearRegression().fit(mean_predicted_value, fraction_of_positives)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    print(f"Calibration Slope: {slope}")
    print(f"Calibration Intercept: {intercept}")
    df_eval.to_csv(f'valid_predictions_{model_id}.csv')
Scroll down to see more snippets from this codevault.
The author has indicated that this snippet is compatible up to WordPress version: 6.4
Free & Pro
Download this snippet by clicking the download button, then go to the Code Snippet Plugin settings in your WordPress admin dashboard, select the Import menu, and upload this file to import it into your WordPress site.
Pro Only (Coming Soon)
You will be able to click a button to sync this snippet to your WordPress site automatically, and manage from your dashboard all code snippets across every WordPress site of yours that has the Code Snippets Pro plugin installed.
Last modified:
25/06/2024
This snippet has the following status:
Unverified
This snippet has not been verified, use with caution and at your own risk. See details provided by author in sidebar and click below to find out more.
0
3 weeks ago
new_diabetes_df = pd.DataFrame({ 'PATIENT_UNIQUE_NUMBER': list(new_diabetes_patients_2023), 'label': 1 })
3 weeks ago
from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
3 weeks ago
AUROC: 0.6946457345970272 Confusion Matrix: [[379435 258899] [ 6425 13675]]
3 weeks ago
filtered_df = df[df['PATIENT_UNIQUE_NUMBER'].isin(new_diabetes_df['PATIENT_UNIQUE_NUMBER'])]
3 weeks ago
def string_agg(series):
    """Collapse a Series into 'a, b, c': its unique non-null values as strings."""
    unique_values = series.dropna().astype(str).unique()
    return ', '.join(unique_values)


# Group by PATIENT_UNIQUE_NUMBER and aggregate other columns:
# one row per patient, every other column collapsed via string_agg.
aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(string_agg).reset_index()
3 weeks ago
columns_to_aggregate = [col for col in filtered_df.columns if col != 'PATIENT_UNIQUE_NUMBER'] # Apply the aggregation function to each column separately using apply method aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').apply(lambda x: pd.Series({col: string_agg(x[col]) for col in columns_to_aggregate})).reset_index()
3 weeks ago
df['DATE_OF_BIRTH'] = pd.to_datetime(df['DATE_OF_BIRTH'], errors='coerce') # Compute age, handle nulls by filling with NaN df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365
3 weeks ago
# Create a dictionary for the aggregation functions agg_dict = {col: string_agg for col in columns_to_aggregate} # Group by PATIENT_UNIQUE_NUMBER and aggregate other columns using the agg_dict aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(agg_dict).reset_index()
3 weeks ago
df['DATE'] = pd.to_datetime(df['DATE']) df['DATE_OF_BIRTH'] = pd.to_datetime(df['DATE_OF_BIRTH'], errors='coerce') # Compute age, handle nulls by filling with NaN df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365 # Filter rows where AGE is greater than 18 or AGE is NaN filtered_df = df[(df['AGE'] > 18) | df['AGE'].isna()]
3 weeks ago
# Categorize age into intervals bins = [18, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf] labels = ["18_20", "20_30", "30_40", "40_50", "50_60", "60_70", "70_80", "80_90", "90_100", "100+"] filtered_df['AGE_CATEGORY'] = pd.cut(filtered_df['AGE'], bins=bins, labels=labels, right=False)
3 weeks ago
df = df.drop(columns=['DATE_OF_BIRTH'])
3 weeks ago
batch['text'] = [c1 + " " + c2 for c1, c2 in zip(batch['column1'], batch['column2'])]
3 weeks ago
df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365 OverflowError: Overflow in int64 addition
3 weeks ago
def calculate_age(birth_date, current_date):
    """Whole years between birth_date and current_date; None on missing or bad input."""
    try:
        # Missing dates (NaT/None) cannot yield an age.
        if pd.isnull(birth_date) or pd.isnull(current_date):
            return None
        years = current_date.year - birth_date.year
        # Subtract one if this year's birthday has not happened yet.
        birthday_pending = (current_date.month, current_date.day) < (birth_date.month, birth_date.day)
        return years - 1 if birthday_pending else years
    except Exception:
        # Best-effort: any malformed date value maps to None rather than raising.
        return None


# Apply the function to the DataFrame row-by-row.
df['AGE'] = df.apply(lambda row: calculate_age(row['DATE_OF_BIRTH'], row['DATE']), axis=1)
3 weeks ago
merged_df = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
3 weeks ago
code_dict = { 1: 'Single', 2: 'Married', 3: 'Separated', 4: 'Divorced', 5: 'Widowed', 88: 'Others', 99: 'Unknown' } # Cast 'code' column to numeric, coercing errors to NaN df['code'] = pd.to_numeric(df['code'], errors='coerce').astype('Int64') # Mapping the 'code' column to the 'value' column df['value'] = df['code'].map(code_dict) # Dropping the 'code' column df.drop('code', axis=1, inplace=True)
3 weeks ago
Unique patients in 2022: 3330513 Unique encounters in 2022: 9690628 Unique patients in 2023: 7227451 Unique encounters in 2023: 21893534 Unique overlapping patients in 2022/2023: 1958711 Unique diabetes patients in 2022: 268496 Unique diabetes patients in 2023: 659478 Unique new diabetes patients in 2023: 478638 Unique new diabetes patients in 2023 but exist in 2022: 85817
3 weeks ago
def preprocess_function(batch):
    """Build a space-joined 'text' field from six patient attribute columns.

    Runs in batched mode: each value in ``batch`` is a column-aligned list.
    Falsy cells (None, "") contribute an empty segment so every row keeps
    six space-separated positions.

    BUGFIX: the original expression
    ``c1 if c1 else "" + " " + c2 if c2 else "" + ...`` parsed as NESTED
    conditional expressions (``x if c else y`` binds looser than ``+``), so a
    truthy first column returned just that column and dropped the rest.
    Each column is now defaulted independently before joining.  The per-batch
    debug ``print`` of the full text list was also removed.
    """
    columns = ['PRINICIPAL_DIAGNOSIS', 'SECONDARY_DIAGNOSIS', 'AGE_CATEGORY',
               'MARITAL_STATUS', 'GENDER', 'BLOOD_GROUP']
    batch['text'] = [
        " ".join(str(value) if value else "" for value in row)
        for row in zip(*(batch[col] for col in columns))
    ]
    return batch
3 weeks ago
AUROC: 0.7870992018148124 Confusion Matrix: [[362472 130318] [ 7610 18012]]
3 weeks ago
lightgbm
3 weeks ago
import shap
2 weeks ago
def string_agg(series): return ', '.join(series.dropna().astype(str).unique()) columns_to_aggregate = [col for col in filtered_df.columns if col in ['PRINICIPAL_DIAGNOSIS','SECONDARY_DIAGNOSIS', 'AGE_CATEGORY', 'MARITAL_STATUS','BLOOD_GROUP','GENDER']] # Create a dictionary for the aggregation functions agg_dict = {col: string_agg for col in columns_to_aggregate} # Group by PATIENT_UNIQUE_NUMBER and aggregate other columns using the agg_dict aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(agg_dict).reset_index()
2 weeks ago
df_2022 = df[df['DATE'].dt.year == 2022]
2 weeks ago
# Categorize age into intervals bins = [18, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150] labels = ["18_20", "20_30", "30_40", "40_50", "50_60", "60_70", "70_80", "80_90", "90_100", "100+"] df['AGE_CATEGORY'] = pd.cut(df['AGE'], bins=bins, labels=labels, right=False) df = df.drop(columns=['AGE'])
2 weeks ago
' 30_40 weight_category: blood_pressure_category: '
2 weeks ago
'Z36 30_40 weight_category: 60_70 blood_pressure_category: Prehypertension'
2 weeks ago
'S51, Z48.0, Z48, S60 30_40 weight_category: 60_70 blood_pressure_category: Hypertension, Prehypertension'
2 weeks ago
'nan, R07.3, R51, R07.4, M79.66, S93.40 nan, R07.3, R51, R07.4, M79.66, S93.40 18_20, 40_50, 20_30 nan, MARRIED, SINGLE Male, Female weight_category: 50_60, 60_70 blood_pressure_category: '
2 weeks ago
'B21, nan nan, Z01.9 50_60, 40_50 Married Male Unspecified, nan weight_category: 30_40, 60_70 blood_pressure_category: , Low, Prehypertension'
2 weeks ago
'H26.9, H35.8, H04.1, Z96.1, H26.4, nan, R11 H26.9, H35.8, H04.1, Z96.1, H26.4, nan, R11 20_30 SINGLE, Separated Female weight_category: 30_40 blood_pressure_category: Normal, '
2 weeks ago
'nan, Z47.9, M79.66, S90.3, J06.9, N39.0, M25.57, R51 nan, Z47.9, M79.66, S90.3, J06.9, N39.0, M25.57, R51 40_50 MARRIED, nan Female, Male weight_category: 90_100, 60_70 blood_pressure_category: '
2 weeks ago
AUROC: 0.7942003533272798 Confusion Matrix: [[361594 131196] [ 7290 18332]]
2 weeks ago
AUROC: 0.8073838136768197 Confusion Matrix: [[364483 128307] [ 6956 18666]]
2 weeks ago
['I10', 'I11', 'I12', 'I13', 'I15', 'I16', 'I1A ']
2 weeks ago
AUROC: 0.8500533117656335 Confusion Matrix: [[746078 211320] [ 7174 23079]]
2 weeks ago
AUROC: 0.8821627617305638 Confusion Matrix: [[803831 180300] [ 5018 19454]]
2 weeks ago
Unique disease patients in 2022: 206439 Unique disease patients in 2023: 526500 Unique new disease patients in 2023: 403090 Unique new disease patients in 2023 but exist in 2022: 82702
2 weeks ago
from sklearn.utils.class_weight import compute_sample_weight class_weights = compute_sample_weight(class_weight='balanced', y=y_train) gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42) gbc.fit(X_train, y_train, sample_weight=class_weights)
2 weeks ago
AUROC: 0.888811040480828 Confusion Matrix: [[804804 179327] [ 4773 19699]] AUPR 0.20773241092112799
2 weeks ago
Confusion Matrix: [[796889 187242] [ 5629 18843]] -------------- AUROC 0.86 ( 0.858 , 0.863 ) CI 95% AUPRC 0.155 ( 0.152 , 0.159) CI 95%
1 week ago
train_df.shape (2353406, 2) train_df.shape (2353406, 15) statistics for train Total number of patients: 2353406 Number of positive patients: 58230 (2.47%) Number of negative patients: 2295176 (97.53%) Number of male patients: 251903 (10.70%) Number of female patients: 252965 (10.75%) Mean age of patients: 24.77 Standard deviation of age: 19.55 statistics for test Total number of patients: 1008603 Number of positive patients: 24472 (2.43%) Number of negative patients: 984131 (97.57%) Number of male patients: 108317 (10.74%) Number of female patients: 108359 (10.74%) Mean age of patients: 24.75 Standard deviation of age: 19.54 end training end predictions Confusion Matrix: [[801552 182579] [ 5171 19301]] auprc : 0.20717539611916558 auroc : 0.8827933964800063 CalibratedClassifier .. Predictions and calibration curve auprc : 0.20287415215006616 auroc : 0.8828892513497076 Calibration Slope: 0.9885351667299401 Calibration Intercept: -0.0019129358148105013
1 week ago
AUROC 0.883 ( 0.882 , 0.884 ) CI 95% AUPRC 0.203 ( 0.204 , 0.21) CI 95% Setting threshold to 0.024418 the model achieves 0.798259 sensitivity threshold 0.0244181504729227 Number of high-risk patients: 210998 df_eval.shape (1008603, 6)
1 week ago
train_df.shape (2304517, 2) train_df.shape (2304517, 15) statistics for train Total number of patients: 2304517 Number of positive patients: 71598 (3.11%) Number of negative patients: 2232919 (96.89%) Number of male patients: 244523 (10.61%) Number of female patients: 245190 (10.64%) Mean age of patients: 27.12 Standard deviation of age: 18.62 statistics for test Total number of patients: 987651 Number of positive patients: 30253 (3.06%) Number of negative patients: 957398 (96.94%) Number of male patients: 104900 (10.62%) Number of female patients: 104567 (10.59%) Mean age of patients: 27.09 Standard deviation of age: 18.62 start training Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression n_iter_i = _check_optimize_result( end training end predictions Confusion Matrix: [[739338 218060] [ 7223 23030]] auprc : 0.1887900025175293 auroc : 0.8499321037521894 CalibratedClassifier .. Predictions and calibration curve auprc : 0.18562999886445508 auroc : 0.8499755479406763 Calibration Slope: 0.9460621612396695 Calibration Intercept: 0.0002606630351691952
Please see below for some snippets related to this one.
General
AI Verified
General
AI Verified
Added: 8 months ago
Last Updated: 8 months ago
<p>Use these to migrate data from BasePress to BetterDocs, or clone them and customize to migrate to some other structure.</p> <p>Import all 4 snippets, then run them in order. This will handle docs,...
General
AI Verified
Added: 7 months ago
Last Updated: 3 days ago
These are some popular snippets from this users codevault..
General
Unverified
Added: 3 weeks ago
Last Updated: 4 days ago
type_dic = {"PATIENT_UNIQUE_NUMBER": "str" , "Cluster" : "str", "GENDER" : "str", "AGE" : "float64", "Nationality" : "str"...
Category
Snippet Status