Unverified

Name

comandsv2

About

"""TF-IDF + logistic-regression pipeline over per-patient observation CSVs.

The script merges several "observation" tables onto a base patient table,
joins the enabled text columns into one document per patient, vectorizes
with TF-IDF, and then either trains and saves a model (train_mode=True) or
loads saved artifacts and writes predictions (train_mode=False).
Evaluation and isotonic calibration run only in train mode.

NOTE(review): this snippet was captured with its original line breaks lost;
the indentation below is a best-effort reconstruction — confirm which
statements sit under each `if` against the original file.
"""
import json
import joblib
from datasets import Dataset
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, auc
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from modeling.ClasicModels.performance_utils import plot_roc_curve, plot_pr_curve, plt_show, get_model_performance, \
    get_threshold, evaluate_new, print_statistics


# Function to conditionally merge dataframes based on flags
def conditional_merge(base_df, merge_df, merge_flag, merge_col='PATIENT_UNIQUE_NUMBER'):
    """Left-join ``merge_df`` onto ``base_df`` when ``merge_flag`` is truthy.

    Joins base column 'id' against ``merge_col`` on the right side, then
    drops the right-side key column.  Returns ``base_df`` unchanged when
    the flag is off.
    """
    if merge_flag:
        base_df = pd.merge(base_df, merge_df, left_on='id', right_on=merge_col, how='left')
        base_df.drop(columns=[merge_col], inplace=True)
    return base_df


def read_observation_data_conditionally(file_tuple, path, flag):
    """Read ``path + file_tuple[0]`` as CSV when ``flag`` is truthy.

    Casts the join-key column ``file_tuple[1]`` to str so it matches the
    base table's string ids.  Returns None when the flag is off.
    """
    if flag:
        data = pd.read_csv(path + file_tuple[0], low_memory=False)
        data[file_tuple[1]] = data[file_tuple[1]].astype(str)
        return data
    return None


# Flags to enable/disable merging of specific dataframes
observation_data_flag = True
pharmacy_observations_flag = True
radiology_observations_flag = True
vital_observations_flag = True
speciality_and_facilitiy_observations_flag = True  # (sic) spelling kept — name is used consistently below
visit_observations_flag = True
lab_observations_flag = True
prediction_mode = True   # write a predictions CSV for the test split
train_mode = False       # False: load saved vectorizer/model and only predict
model_id = 'd'           # suffix for every saved/loaded artifact filename

# Define file and column name tuples based on train_mode flag.
# Each tuple is (csv filename, join-key column name inside that file).
if train_mode:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_v4/"
    patient_data = ('p_data_d_v2.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('Pharmacy_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    radiology_observations = ('Radiology_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    vital_observations = ('Vital_observations.csv', 'patient_unique_number')
    speciality_and_facility_observations = ('speciality_and_facility_codes_observations.csv', 'patient_unique_number')
    visit_observations = ('reason_for_visit_observations.csv', 'patient_unique_number')
    lab_observations = ('LAB_INFO_observations.csv', 'patient_unique_number')
else:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_2023/"
    patient_data = ('p_data_pred_d.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations_2023.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('pharmacy_observations_2023.csv', 'patient_id')
    radiology_observations = ('radiology_observations_2023.csv', 'patient_id')
    vital_observations = ('vital_observations_2023.csv', 'patient_id')
    speciality_and_facility_observations = ('facility_speciality_observations_2023.csv', 'patient_id')
    visit_observations = ('visit_reason_observations_2023.csv', 'patient_id')
    lab_observations = ('lab_observations_2023.csv', 'patient_id')

# Read the main dataframe
df = pd.read_csv(observation_path + patient_data[0], low_memory=False)
df['PATIENT_UNIQUE_NUMBER'] = df[patient_data[1]].astype(str)
# Positional rename: (id, label) when a label column is present, else just id.
# NOTE(review): assumes the patient file has exactly 1 or 2 columns — confirm.
df.columns = ['id', 'label'] if len(df.columns) == 2 else ['id']

# Function to read observation data
observation_data_df = read_observation_data_conditionally(observation_data, observation_path, observation_data_flag)
pharmacy_observations_df = read_observation_data_conditionally(pharmacy_observations, observation_path, pharmacy_observations_flag)
radiology_observations_df = read_observation_data_conditionally(radiology_observations, observation_path, radiology_observations_flag)
vital_observations_df = read_observation_data_conditionally(vital_observations, observation_path, vital_observations_flag)
speciality_and_facility_observations_df = read_observation_data_conditionally(speciality_and_facility_observations, observation_path, speciality_and_facilitiy_observations_flag)
visit_observations_df = read_observation_data_conditionally(visit_observations, observation_path, visit_observations_flag)
lab_observations_df = read_observation_data_conditionally(lab_observations, observation_path, lab_observations_flag)

# Train mode: 70/30 split.  Prediction mode: everything goes to "test" and
# the train frame stays empty (the merges below become cheap no-ops on it).
if train_mode:
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
else:
    test_df = df
    train_df = pd.DataFrame(columns=df.columns)
train_df['id'] = train_df['id'].str.strip()
test_df['id'] = test_df['id'].str.strip()

take_sample = False
if take_sample:
    # Undersample the negative class down to the positive-class size.
    true_rows = train_df[train_df['label'] == 1]
    false_rows = train_df[train_df['label'] == 0]
    sample_size = min(len(true_rows), len(false_rows))
    false_sample = false_rows.sample(n=sample_size, random_state=42)
    train_df = pd.concat([true_rows, false_sample])
# test_df = test_df.sample(frac = 0.1)

print('train_df.shape', train_df.shape)
train_df = conditional_merge(train_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
train_df = conditional_merge(train_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
train_df = conditional_merge(train_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
train_df = conditional_merge(train_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
train_df = conditional_merge(train_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
train_df = conditional_merge(train_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
train_df = conditional_merge(train_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])
print('train_df.shape', train_df.shape)

test_df = conditional_merge(test_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
test_df = conditional_merge(test_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
test_df = conditional_merge(test_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
test_df = conditional_merge(test_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
test_df = conditional_merge(test_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
test_df = conditional_merge(test_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
test_df = conditional_merge(test_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])

print('statistics for train')
print_statistics(train_df)
print('statistics for test')
print_statistics(test_df)

train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(test_df)


def preprocess_function(batch):
    """Batched HF-datasets map fn: join the enabled observation columns of
    each row into a single newline-separated 'text' document.

    NOTE(review): column names such as 'pharamacy_observations',
    'lab_info_ovservations' and 'PRINICIPAL_DIAGNOSIS' carry spelling quirks;
    they must match the merged CSV headers exactly, so they are kept as-is.
    """
    # Extract the list of ids
    observation_data_columns = ['PRINICIPAL_DIAGNOSIS', 'SECONDARY_DIAGNOSIS', 'AGE_CATEGORY', 'MARITAL_STATUS', 'GENDER', 'BLOOD_GROUP']
    text_parts = []  # list of per-column value lists; zipped row-wise below
    if observation_data_flag:
        text_parts.extend([batch[col] for col in observation_data_columns])
    if pharmacy_observations_flag:
        text_parts.append(batch['pharamacy_observations'])
    if radiology_observations_flag:
        text_parts.append(batch['radiology_observations'])
    if speciality_and_facilitiy_observations_flag:
        text_parts.append(batch['facility_and_specialty_codes_observations'])
    if vital_observations_flag:
        text_parts.append(batch['vital_observations'])
    if visit_observations_flag:
        text_parts.append(batch['reason_for_visit'])
    if lab_observations_flag:
        text_parts.append(batch['lab_info_ovservations'])
    # One '\n'-joined string per row; falsy parts (e.g. None) become "".
    batch['text'] = ['\n'.join([str(part) if part else "" for part in parts]) for parts in zip(*text_parts)]
    return batch


train_data = train_data.map(preprocess_function, batched=True, batch_size=1000)
eval_data = eval_data.map(preprocess_function, batched=True, batch_size=1000)

# Fit the TF-IDF vocabulary only in train mode; otherwise reuse the saved
# one so the train/predict feature spaces line up.
if train_mode:
    vectorizer = TfidfVectorizer()
    patient_history_train = vectorizer.fit_transform(train_data['text'])
    joblib.dump(vectorizer, f'vectorizer_{model_id}.pkl')
else:
    vectorizer = joblib.load(f'vectorizer_{model_id}.pkl')
patient_history_test = vectorizer.transform(eval_data['text'])

if train_mode:
    print('start training')
    model = LogisticRegression(class_weight='balanced', C=10)
    model.fit(patient_history_train, train_df['label'])
    print('end training')
    joblib.dump(model, f'model_{model_id}.pkl')
else:
    model = joblib.load(f'model_{model_id}.pkl')

predictions = model.predict_proba(patient_history_test)
print('end predictions')
predictions = predictions[:, 1]  # keep the positive-class probability

#################################
if prediction_mode:
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_pred': predictions})
    df_eval.to_csv(f'predictions_{model_id}.csv')

# Evaluation + calibration need ground-truth labels, so they run only when
# training (the prediction-mode patient file carries no 'label' column).
if train_mode:
    cm = confusion_matrix(test_df['label'], np.round(predictions))
    print("Confusion Matrix:")
    print(cm)
    ###############################################
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_truth': test_df['label'], 'y_pred': predictions})
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='y_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    ############ CalibratedClassifierCV #####################
    print('CalibratedClassifier .. ')
    # cv='prefit': calibrate the already-fitted model on the training matrix.
    calibrated_clf = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
    calibrated_clf.fit(patient_history_train, train_df['label'])
    # Predictions and calibration curve
    print('Predictions and calibration curve')
    y_pred = calibrated_clf.predict_proba(patient_history_test)[:, 1]
    fraction_of_positives, mean_predicted_value = calibration_curve(test_df['label'], y_pred, n_bins=10)
    df_eval['calibrated_pred'] = y_pred
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='calibrated_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)
    # Reshape for linear regression
    mean_predicted_value = mean_predicted_value.reshape(-1, 1)
    # Calibration slope/intercept: regress observed fraction on mean predicted
    # value (perfect calibration gives slope 1, intercept 0).
    reg = LinearRegression().fit(mean_predicted_value, fraction_of_positives)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    print(f"Calibration Slope: {slope}")
    print(f"Calibration Intercept: {intercept}")
    df_eval.to_csv(f'valid_predictions_{model_id}.csv')

Language

Python

Rating

Voted: 0 by 0 user(s)

How to Setup Snippet

"""TF-IDF + logistic-regression pipeline over per-patient observation CSVs.

The script merges several "observation" tables onto a base patient table,
joins the enabled text columns into one document per patient, vectorizes
with TF-IDF, and then either trains and saves a model (train_mode=True) or
loads saved artifacts and writes predictions (train_mode=False).
Evaluation and isotonic calibration run only in train mode.

NOTE(review): this snippet was captured with its original line breaks lost;
the indentation below is a best-effort reconstruction — confirm which
statements sit under each `if` against the original file.
"""
import json
import joblib
from datasets import Dataset
from matplotlib import pyplot as plt
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_curve, auc
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from modeling.ClasicModels.performance_utils import plot_roc_curve, plot_pr_curve, plt_show, get_model_performance, \
    get_threshold, evaluate_new, print_statistics


# Function to conditionally merge dataframes based on flags
def conditional_merge(base_df, merge_df, merge_flag, merge_col='PATIENT_UNIQUE_NUMBER'):
    """Left-join ``merge_df`` onto ``base_df`` when ``merge_flag`` is truthy.

    Joins base column 'id' against ``merge_col`` on the right side, then
    drops the right-side key column.  Returns ``base_df`` unchanged when
    the flag is off.
    """
    if merge_flag:
        base_df = pd.merge(base_df, merge_df, left_on='id', right_on=merge_col, how='left')
        base_df.drop(columns=[merge_col], inplace=True)
    return base_df


def read_observation_data_conditionally(file_tuple, path, flag):
    """Read ``path + file_tuple[0]`` as CSV when ``flag`` is truthy.

    Casts the join-key column ``file_tuple[1]`` to str so it matches the
    base table's string ids.  Returns None when the flag is off.
    """
    if flag:
        data = pd.read_csv(path + file_tuple[0], low_memory=False)
        data[file_tuple[1]] = data[file_tuple[1]].astype(str)
        return data
    return None


# Flags to enable/disable merging of specific dataframes
observation_data_flag = True
pharmacy_observations_flag = True
radiology_observations_flag = True
vital_observations_flag = True
speciality_and_facilitiy_observations_flag = True  # (sic) spelling kept — name is used consistently below
visit_observations_flag = True
lab_observations_flag = True
prediction_mode = True   # write a predictions CSV for the test split
train_mode = False       # False: load saved vectorizer/model and only predict
model_id = 'd'           # suffix for every saved/loaded artifact filename

# Define file and column name tuples based on train_mode flag.
# Each tuple is (csv filename, join-key column name inside that file).
if train_mode:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_v4/"
    patient_data = ('p_data_d_v2.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('Pharmacy_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    radiology_observations = ('Radiology_observations.csv', 'PATIENT_UNIQUE_NUMBER')
    vital_observations = ('Vital_observations.csv', 'patient_unique_number')
    speciality_and_facility_observations = ('speciality_and_facility_codes_observations.csv', 'patient_unique_number')
    visit_observations = ('reason_for_visit_observations.csv', 'patient_unique_number')
    lab_observations = ('LAB_INFO_observations.csv', 'patient_unique_number')
else:
    observation_path = r"C:/Users/dev1/Documents/CHI_Projects/data_2023/"
    patient_data = ('p_data_pred_d.csv', 'PATIENT_UNIQUE_NUMBER')
    observation_data = ('basic_observations_2023.csv', 'PATIENT_UNIQUE_NUMBER')
    pharmacy_observations = ('pharmacy_observations_2023.csv', 'patient_id')
    radiology_observations = ('radiology_observations_2023.csv', 'patient_id')
    vital_observations = ('vital_observations_2023.csv', 'patient_id')
    speciality_and_facility_observations = ('facility_speciality_observations_2023.csv', 'patient_id')
    visit_observations = ('visit_reason_observations_2023.csv', 'patient_id')
    lab_observations = ('lab_observations_2023.csv', 'patient_id')

# Read the main dataframe
df = pd.read_csv(observation_path + patient_data[0], low_memory=False)
df['PATIENT_UNIQUE_NUMBER'] = df[patient_data[1]].astype(str)
# Positional rename: (id, label) when a label column is present, else just id.
# NOTE(review): assumes the patient file has exactly 1 or 2 columns — confirm.
df.columns = ['id', 'label'] if len(df.columns) == 2 else ['id']

# Function to read observation data
observation_data_df = read_observation_data_conditionally(observation_data, observation_path, observation_data_flag)
pharmacy_observations_df = read_observation_data_conditionally(pharmacy_observations, observation_path, pharmacy_observations_flag)
radiology_observations_df = read_observation_data_conditionally(radiology_observations, observation_path, radiology_observations_flag)
vital_observations_df = read_observation_data_conditionally(vital_observations, observation_path, vital_observations_flag)
speciality_and_facility_observations_df = read_observation_data_conditionally(speciality_and_facility_observations, observation_path, speciality_and_facilitiy_observations_flag)
visit_observations_df = read_observation_data_conditionally(visit_observations, observation_path, visit_observations_flag)
lab_observations_df = read_observation_data_conditionally(lab_observations, observation_path, lab_observations_flag)

# Train mode: 70/30 split.  Prediction mode: everything goes to "test" and
# the train frame stays empty (the merges below become cheap no-ops on it).
if train_mode:
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
else:
    test_df = df
    train_df = pd.DataFrame(columns=df.columns)
train_df['id'] = train_df['id'].str.strip()
test_df['id'] = test_df['id'].str.strip()

take_sample = False
if take_sample:
    # Undersample the negative class down to the positive-class size.
    true_rows = train_df[train_df['label'] == 1]
    false_rows = train_df[train_df['label'] == 0]
    sample_size = min(len(true_rows), len(false_rows))
    false_sample = false_rows.sample(n=sample_size, random_state=42)
    train_df = pd.concat([true_rows, false_sample])
# test_df = test_df.sample(frac = 0.1)

print('train_df.shape', train_df.shape)
train_df = conditional_merge(train_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
train_df = conditional_merge(train_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
train_df = conditional_merge(train_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
train_df = conditional_merge(train_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
train_df = conditional_merge(train_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
train_df = conditional_merge(train_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
train_df = conditional_merge(train_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])
print('train_df.shape', train_df.shape)

test_df = conditional_merge(test_df, observation_data_df, observation_data_flag, merge_col=observation_data[1])
test_df = conditional_merge(test_df, pharmacy_observations_df, pharmacy_observations_flag, merge_col=pharmacy_observations[1])
test_df = conditional_merge(test_df, radiology_observations_df, radiology_observations_flag, merge_col=radiology_observations[1])
test_df = conditional_merge(test_df, vital_observations_df, vital_observations_flag, merge_col=vital_observations[1])
test_df = conditional_merge(test_df, speciality_and_facility_observations_df, speciality_and_facilitiy_observations_flag, merge_col=speciality_and_facility_observations[1])
test_df = conditional_merge(test_df, visit_observations_df, visit_observations_flag, merge_col=visit_observations[1])
test_df = conditional_merge(test_df, lab_observations_df, lab_observations_flag, merge_col=lab_observations[1])

print('statistics for train')
print_statistics(train_df)
print('statistics for test')
print_statistics(test_df)

train_data = Dataset.from_pandas(train_df)
eval_data = Dataset.from_pandas(test_df)


def preprocess_function(batch):
    """Batched HF-datasets map fn: join the enabled observation columns of
    each row into a single newline-separated 'text' document.

    NOTE(review): column names such as 'pharamacy_observations',
    'lab_info_ovservations' and 'PRINICIPAL_DIAGNOSIS' carry spelling quirks;
    they must match the merged CSV headers exactly, so they are kept as-is.
    """
    # Extract the list of ids
    observation_data_columns = ['PRINICIPAL_DIAGNOSIS', 'SECONDARY_DIAGNOSIS', 'AGE_CATEGORY', 'MARITAL_STATUS', 'GENDER', 'BLOOD_GROUP']
    text_parts = []  # list of per-column value lists; zipped row-wise below
    if observation_data_flag:
        text_parts.extend([batch[col] for col in observation_data_columns])
    if pharmacy_observations_flag:
        text_parts.append(batch['pharamacy_observations'])
    if radiology_observations_flag:
        text_parts.append(batch['radiology_observations'])
    if speciality_and_facilitiy_observations_flag:
        text_parts.append(batch['facility_and_specialty_codes_observations'])
    if vital_observations_flag:
        text_parts.append(batch['vital_observations'])
    if visit_observations_flag:
        text_parts.append(batch['reason_for_visit'])
    if lab_observations_flag:
        text_parts.append(batch['lab_info_ovservations'])
    # One '\n'-joined string per row; falsy parts (e.g. None) become "".
    batch['text'] = ['\n'.join([str(part) if part else "" for part in parts]) for parts in zip(*text_parts)]
    return batch


train_data = train_data.map(preprocess_function, batched=True, batch_size=1000)
eval_data = eval_data.map(preprocess_function, batched=True, batch_size=1000)

# Fit the TF-IDF vocabulary only in train mode; otherwise reuse the saved
# one so the train/predict feature spaces line up.
if train_mode:
    vectorizer = TfidfVectorizer()
    patient_history_train = vectorizer.fit_transform(train_data['text'])
    joblib.dump(vectorizer, f'vectorizer_{model_id}.pkl')
else:
    vectorizer = joblib.load(f'vectorizer_{model_id}.pkl')
patient_history_test = vectorizer.transform(eval_data['text'])

if train_mode:
    print('start training')
    model = LogisticRegression(class_weight='balanced', C=10)
    model.fit(patient_history_train, train_df['label'])
    print('end training')
    joblib.dump(model, f'model_{model_id}.pkl')
else:
    model = joblib.load(f'model_{model_id}.pkl')

predictions = model.predict_proba(patient_history_test)
print('end predictions')
predictions = predictions[:, 1]  # keep the positive-class probability

#################################
if prediction_mode:
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_pred': predictions})
    df_eval.to_csv(f'predictions_{model_id}.csv')

# Evaluation + calibration need ground-truth labels, so they run only when
# training (the prediction-mode patient file carries no 'label' column).
if train_mode:
    cm = confusion_matrix(test_df['label'], np.round(predictions))
    print("Confusion Matrix:")
    print(cm)
    ###############################################
    df_eval = pd.DataFrame({'patient_id': test_df['id'], 'y_truth': test_df['label'], 'y_pred': predictions})
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='y_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)

    ############ CalibratedClassifierCV #####################
    print('CalibratedClassifier .. ')
    # cv='prefit': calibrate the already-fitted model on the training matrix.
    calibrated_clf = CalibratedClassifierCV(estimator=model, method='isotonic', cv='prefit')
    calibrated_clf.fit(patient_history_train, train_df['label'])
    # Predictions and calibration curve
    print('Predictions and calibration curve')
    y_pred = calibrated_clf.predict_proba(patient_history_test)[:, 1]
    fraction_of_positives, mean_predicted_value = calibration_curve(test_df['label'], y_pred, n_bins=10)
    df_eval['calibrated_pred'] = y_pred
    auprc, auroc = evaluate_new(df_eval, y_truth_column_name='y_truth', y_pred_column_name='calibrated_pred')
    print('auprc : ', auprc)
    print('auroc : ', auroc)
    # Reshape for linear regression
    mean_predicted_value = mean_predicted_value.reshape(-1, 1)
    # Calibration slope/intercept: regress observed fraction on mean predicted
    # value (perfect calibration gives slope 1, intercept 0).
    reg = LinearRegression().fit(mean_predicted_value, fraction_of_positives)
    slope = reg.coef_[0]
    intercept = reg.intercept_
    print(f"Calibration Slope: {slope}")
    print(f"Calibration Intercept: {intercept}")
    df_eval.to_csv(f'valid_predictions_{model_id}.csv')

Codevault

mhmdateeq

Scroll down to see more snippets from this codevault.

WordPress Compatibility

The author has indicated that this snippet is compatible up to WordPress version: 6.4

Code Snippet Plugin Sync

Free & Pro

Download this snippet by clicking the download button, then head over to the Code Snippet Plugin settings in your WordPress admin dashboard, select the import menu, and upload this file to import it into your WordPress site.

Pro Only (Coming Soon)

You will be able to click a button and sync this snippet to your WordPress site automatically, and from your dashboard manage all code snippets across all your WordPress sites that have the Code Snippets Pro plugin installed.

History

Last modified:

25/06/2024

Important Note

This snippet has the following status:

Unverified

This snippet has not been verified, use with caution and at your own risk. See details provided by author in sidebar and click below to find out more.

comandsv2

 
                    
# Compute the subset of df for the specified conditions and label rows:
# patients seen in both years (overlapping_patients) who were NOT already
# diabetic in 2022.  .copy() makes an explicit new frame so the column
# assignment below cannot trigger pandas' chained-assignment warning or
# silently write to a view.
subset_df = df[df['PATIENT_UNIQUE_NUMBER'].isin(overlapping_patients)
               & ~df['PATIENT_UNIQUE_NUMBER'].isin(diabetes_patients_2022)].copy()

# Add a new column to label the rows.  Vectorized .isin() gives the same
# boolean result as the per-row apply(lambda x: x in ...) it replaces,
# in a single C-level pass instead of one Python call per row.
subset_df['is_new_diabetic_2023'] = subset_df['PATIENT_UNIQUE_NUMBER'].isin(new_diabetes_patients_2023)

0

Comments

  • 3 weeks ago

    new_diabetes_df = pd.DataFrame({
        'PATIENT_UNIQUE_NUMBER': list(new_diabetes_patients_2023),
        'label': 1
    })
  • 3 weeks ago

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  • 3 weeks ago

    AUROC:  0.6946457345970272
    Confusion Matrix:
    [[379435 258899]
     [  6425  13675]]
  • 3 weeks ago

    filtered_df = df[df['PATIENT_UNIQUE_NUMBER'].isin(new_diabetes_df['PATIENT_UNIQUE_NUMBER'])]
  • 3 weeks ago

    def string_agg(series):
        return ', '.join(series.dropna().astype(str).unique())
    
    # Group by PATIENT_UNIQUE_NUMBER and aggregate other columns
    aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(string_agg).reset_index()
  • 3 weeks ago

    columns_to_aggregate = [col for col in filtered_df.columns if col != 'PATIENT_UNIQUE_NUMBER']
    
    # Apply the aggregation function to each column separately using apply method
    aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').apply(lambda x: pd.Series({col: string_agg(x[col]) for col in columns_to_aggregate})).reset_index()
    
  • 3 weeks ago

    df['DATE_OF_BIRTH'] = pd.to_datetime(df['DATE_OF_BIRTH'], errors='coerce')
    
    # Compute age, handle nulls by filling with NaN
    df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365
  • 3 weeks ago

    # Create a dictionary for the aggregation functions
    agg_dict = {col: string_agg for col in columns_to_aggregate}
    
    # Group by PATIENT_UNIQUE_NUMBER and aggregate other columns using the agg_dict
    aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(agg_dict).reset_index()
  • 3 weeks ago

    df['DATE'] = pd.to_datetime(df['DATE'])
    df['DATE_OF_BIRTH'] = pd.to_datetime(df['DATE_OF_BIRTH'], errors='coerce')
    
    # Compute age, handle nulls by filling with NaN
    df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365
    
    # Filter rows where AGE is greater than 18 or AGE is NaN
    filtered_df = df[(df['AGE'] > 18) | df['AGE'].isna()]
  • 3 weeks ago

    # Categorize age into intervals
    bins = [18, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf]
    labels = ["18_20", "20_30", "30_40", "40_50", "50_60", "60_70", "70_80", "80_90", "90_100", "100+"]
    filtered_df['AGE_CATEGORY'] = pd.cut(filtered_df['AGE'], bins=bins, labels=labels, right=False)
    
  • 3 weeks ago

    df = df.drop(columns=['DATE_OF_BIRTH'])
  • 3 weeks ago

    batch['text'] = [c1 + " " + c2 for c1, c2 in zip(batch['column1'], batch['column2'])]
  • 3 weeks ago

     df['AGE'] = (df['DATE'] - df['DATE_OF_BIRTH']).dt.days // 365
    OverflowError: Overflow in int64 addition
  • 3 weeks ago

    def calculate_age(birth_date, current_date):
        try:
            # Check for invalid dates
            if pd.isnull(birth_date) or pd.isnull(current_date):
                return None
            # Calculate the difference in years
            age = current_date.year - birth_date.year
            # Adjust the age if the current date is before the birth date in the current year
            if (current_date.month, current_date.day) < (birth_date.month, birth_date.day):
                age -= 1
            return age
        except Exception as e:
            return None
    
    # Apply the function to the DataFrame
    df['AGE'] = df.apply(lambda row: calculate_age(row['DATE_OF_BIRTH'], row['DATE']), axis=1)
  • 3 weeks ago

    merged_df = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
  • 3 weeks ago

    code_dict = {
        1: 'Single',
        2: 'Married',
        3: 'Separated',
        4: 'Divorced',
        5: 'Widowed',
        88: 'Others',
        99: 'Unknown'
    }
    
    # Cast 'code' column to numeric, coercing errors to NaN
    df['code'] = pd.to_numeric(df['code'], errors='coerce').astype('Int64')
    
    # Mapping the 'code' column to the 'value' column
    df['value'] = df['code'].map(code_dict)
    
    # Dropping the 'code' column
    df.drop('code', axis=1, inplace=True)
  • 3 weeks ago

    Unique patients in 2022: 3330513
    Unique encounters in 2022: 9690628
    Unique patients in 2023: 7227451
    Unique encounters in 2023: 21893534
    Unique overlapping patients in 2022/2023: 1958711
    Unique diabetes patients in 2022: 268496
    Unique diabetes patients in 2023: 659478
    Unique new diabetes patients in 2023: 478638
    Unique new diabetes patients in 2023 but exist in 2022: 85817
  • 3 weeks ago

    def preprocess_function(batch):
        # Extract the list of ids
        #ids = examples['id']
        batch['text'] = [c1 if c1 else "" + " " + c2 if c2 else "" + " " + c3 if c3 else ""+ " " + c4 if c4 else ""
                         + " " + c5 if c5 else "" + " " + c6 if c6 else ""
                         for c1, c2, c3, c4, c5, c6 in zip(batch['PRINICIPAL_DIAGNOSIS'], batch['SECONDARY_DIAGNOSIS'],batch['AGE_CATEGORY'],batch['MARITAL_STATUS'],batch['GENDER'],batch['BLOOD_GROUP'] )]
        print(batch['text'])
        return batch
    
  • 3 weeks ago

    AUROC:  0.7870992018148124
    Confusion Matrix:
    [[362472 130318]
     [  7610  18012]]
  • 3 weeks ago

    lightgbm
  • 3 weeks ago

    import shap
  • 2 weeks ago

    def string_agg(series):
        return ', '.join(series.dropna().astype(str).unique())
    
    columns_to_aggregate = [col for col in filtered_df.columns if col in ['PRINICIPAL_DIAGNOSIS','SECONDARY_DIAGNOSIS', 'AGE_CATEGORY', 'MARITAL_STATUS','BLOOD_GROUP','GENDER']]
    
    # Create a dictionary for the aggregation functions
    agg_dict = {col: string_agg for col in columns_to_aggregate}
    
    # Group by PATIENT_UNIQUE_NUMBER and aggregate other columns using the agg_dict
    aggregated_df = filtered_df.groupby('PATIENT_UNIQUE_NUMBER').agg(agg_dict).reset_index()
  • 2 weeks ago

    df_2022 = df[df['DATE'].dt.year == 2022]
    
  • 2 weeks ago

    # Categorize age into intervals
    bins = [18, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150]
    labels = ["18_20", "20_30", "30_40", "40_50", "50_60", "60_70", "70_80", "80_90", "90_100", "100+"]
    df['AGE_CATEGORY'] = pd.cut(df['AGE'], bins=bins, labels=labels, right=False)
    
    df = df.drop(columns=['AGE'])
  • 2 weeks ago

    '
    
    30_40
    
    
    
    weight_category:  
     blood_pressure_category: '
  • 2 weeks ago

    'Z36
    
    30_40
    
    
    
    weight_category: 60_70 
     blood_pressure_category: Prehypertension'
  • 2 weeks ago

    'S51, Z48.0, Z48, S60
    
    30_40
    
    
    
    weight_category: 60_70 
     blood_pressure_category: Hypertension, Prehypertension'
  • 2 weeks ago

    'nan, R07.3, R51, R07.4, M79.66, S93.40
    nan, R07.3, R51, R07.4, M79.66, S93.40
    18_20, 40_50, 20_30
    nan, MARRIED, SINGLE
    Male, Female
    
    weight_category: 50_60, 60_70 
     blood_pressure_category: '
  • 2 weeks ago

    'B21, nan
    nan, Z01.9
    50_60, 40_50
    Married
    Male
    Unspecified, nan
    weight_category: 30_40, 60_70 
     blood_pressure_category: , Low, Prehypertension'
  • 2 weeks ago

    'H26.9, H35.8, H04.1, Z96.1, H26.4, nan, R11
    H26.9, H35.8, H04.1, Z96.1, H26.4, nan, R11
    20_30
    SINGLE, Separated
    Female
    
    weight_category: 30_40 
     blood_pressure_category: Normal, '
  • 2 weeks ago

    'nan, Z47.9, M79.66, S90.3, J06.9, N39.0, M25.57, R51
    nan, Z47.9, M79.66, S90.3, J06.9, N39.0, M25.57, R51
    40_50
    MARRIED, nan
    Female, Male
    
    weight_category: 90_100, 60_70 
     blood_pressure_category: '
  • 2 weeks ago

    AUROC:  0.7942003533272798
    Confusion Matrix:
    [[361594 131196]
     [  7290  18332]]
  • 2 weeks ago

    AUROC:  0.8073838136768197
    Confusion Matrix:
    [[364483 128307]
     [  6956  18666]]
  • 2 weeks ago

    ['I10', 'I11', 'I12', 'I13', 'I15', 'I16', 'I1A ']
  • 2 weeks ago

    AUROC:  0.8500533117656335
    Confusion Matrix:
    [[746078 211320]
     [  7174  23079]]
  • 2 weeks ago

    AUROC:  0.8821627617305638
    Confusion Matrix:
    [[803831 180300]
     [  5018  19454]]
  • 2 weeks ago

    Unique disease patients in 2022: 206439
    Unique disease patients in 2023: 526500
    Unique new disease patients in 2023: 403090
    Unique new disease patients in 2023 but exist in 2022: 82702
  • 2 weeks ago

    from sklearn.utils.class_weight import compute_sample_weight
    class_weights = compute_sample_weight(class_weight='balanced', y=y_train)
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
    gbc.fit(X_train, y_train, sample_weight=class_weights)
    
  • 2 weeks ago

    AUROC:  0.888811040480828
    Confusion Matrix:
    [[804804 179327]
     [  4773  19699]]
    AUPR 0.20773241092112799
  • 2 weeks ago

    Confusion Matrix:
    [[796889 187242]
     [  5629  18843]]
    --------------
    AUROC 0.86 ( 0.858 , 0.863 ) CI 95%
    AUPRC 0.155 ( 0.152 , 0.159) CI 95%
  • 1 week ago

    train_df.shape (2353406, 2)
    train_df.shape (2353406, 15)
    statistics for train
    Total number of patients: 2353406
    Number of positive patients: 58230 (2.47%)
    Number of negative patients: 2295176 (97.53%)
    Number of male patients: 251903 (10.70%)
    Number of female patients: 252965 (10.75%)
    Mean age of patients: 24.77
    Standard deviation of age: 19.55
    statistics for test
    Total number of patients: 1008603
    Number of positive patients: 24472 (2.43%)
    Number of negative patients: 984131 (97.57%)
    Number of male patients: 108317 (10.74%)
    Number of female patients: 108359 (10.74%)
    Mean age of patients: 24.75
    Standard deviation of age: 19.54
    end training
    end predictions
    Confusion Matrix:
    [[801552 182579]
     [  5171  19301]]
    auprc :  0.20717539611916558
    auroc :  0.8827933964800063
    CalibratedClassifier .. 
    Predictions and calibration curve
    auprc :  0.20287415215006616
    auroc :  0.8828892513497076
    Calibration Slope: 0.9885351667299401
    Calibration Intercept: -0.0019129358148105013
    
    
    
  • 1 week ago

    AUROC 0.883 ( 0.882 , 0.884 ) CI 95%
    AUPRC 0.203 ( 0.204 , 0.21) CI 95%
    Setting threshold to 0.024418 the model achieves 0.798259 sensitivity
    threshold 0.0244181504729227
    Number of high-risk patients: 210998
    df_eval.shape (1008603, 6)
  • 1 week ago

    train_df.shape (2304517, 2)
    train_df.shape (2304517, 15)
    statistics for train
    Total number of patients: 2304517
    Number of positive patients: 71598 (3.11%)
    Number of negative patients: 2232919 (96.89%)
    Number of male patients: 244523 (10.61%)
    Number of female patients: 245190 (10.64%)
    Mean age of patients: 27.12
    Standard deviation of age: 18.62
    statistics for test
    Total number of patients: 987651
    Number of positive patients: 30253 (3.06%)
    Number of negative patients: 957398 (96.94%)
    Number of male patients: 104900 (10.62%)
    Number of female patients: 104567 (10.59%)
    Mean age of patients: 27.09
    Standard deviation of age: 18.62
    start training
    Increase the number of iterations (max_iter) or scale the data as shown in:
        https://scikit-learn.org/stable/modules/preprocessing.html
    Please also refer to the documentation for alternative solver options:
        https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
      n_iter_i = _check_optimize_result(
    end training
    end predictions
    Confusion Matrix:
    [[739338 218060]
     [  7223  23030]]
    auprc :  0.1887900025175293
    auroc :  0.8499321037521894
    CalibratedClassifier .. 
    Predictions and calibration curve
    auprc :  0.18562999886445508
    auroc :  0.8499755479406763
    Calibration Slope: 0.9460621612396695
    Calibration Intercept: 0.0002606630351691952
    
    

Related Snippets

Please see some snippets below related to this snippet.

General

AI Verified

0

YOKA

Added: 3 months ago

Last Updated: 3 months ago

kode di bawah ini saya belajar

General

AI Verified

0

Migrate BasePress -> BetterDocs: 1. Docs, Categories, Tags & Meta

Added: 8 months ago

Last Updated: 8 months ago

<p>Use these to migrate data from BasePress to BetterDocs, or clone them and customize to migrate to some other structure.</p> <p>Import all 4 snippets, then run them in order. This will handle docs,...

General

AI Verified

Other Snippets in this Codevault

These are some popular snippets from this user's codevault.

General

Unverified

0

adasdas

Added: 4 weeks ago

Last Updated: 4 weeks ago

General

Unverified

0

test

Added: 3 weeks ago

Last Updated: 4 days ago

type_dic = {"PATIENT_UNIQUE_NUMBER": "str" , "Cluster" : "str", "GENDER" : "str", "AGE" : "float64", "Nationality" : "str"...

General

Unverified

0

gpt

Added: 3 weeks ago

Last Updated: 1 week ago

import pandas as pd # Sample dataframe data = { 'PATIENT_UNIQUE_NUMBER': [1, 2, 1, 3, 1, 2, 3, 4, 4], 'has_disease': [True, False, True, True, False, True, False, True, True] } df = pd...