STAVER’s robustness and applicability in disease diagnosis and classification.¶

To validate the outstanding advantages of the STAVER algorithm for potential clinical decision-making, we comprehensively evaluated the generalization performance and prediction accuracy of classification models constructed based on the above tumor biomarkers.

To illustrate the robustness and broad applicability of the STAVER algorithm, we constructed three separate benchmark classification models based on different algorithms, including decision tree-based random forest, gradient boosting-based XGBoost, and linear regression-based logistic regression models (Materials and methods section).

As a result, compared to the original proteomic data, the classification models constructed based on STAVER-processed data exhibited increased generalization performance in distinguishing specific cancer-type cell lines.

[1]:

import numpy as np
import pandas as pd
import xgboost as xgb
import itertools
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from scipy.stats import norm
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# python matplotlib export editable PDF
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['figure.dpi']= 150

custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

import warnings
warnings.filterwarnings('ignore')

Define the DeLong’s test function and test it¶

[2]:

class DelongTest:
    def __init__(self, preds1, preds2, label, threshold=0.05):
        """
        Initialize the Delong Test.

        Args:
            preds1: Prediction probabilities from the first model.
            preds2: Prediction probabilities from the second model.
            label: Actual labels (ground truth).
            threshold: Significance level for the p-value. Default is 0.05.

        Returns:
            z: The z-score for the two-tailed test.
            p: The p-value for the two-tailed test.
        """
        self._preds1 = preds1
        self._preds2 = preds2
        self._label = label
        self.threshold = threshold
        self.z, self.p = self._compute_z_p()  # Directly compute and store results on initialization
        self._show_result()

    def _auc(self, X, Y):
        """Compute the AUC using the Mann-Whitney statistic."""
        return 1 / (len(X) * len(Y)) * sum([self._kernel(x, y) for x in X for y in Y])

    @staticmethod
    def _kernel(X, Y):
        """Kernel function to compute the Mann-Whitney statistic."""
        return 0.5 if Y == X else int(Y < X)

    def _structural_components(self, X, Y):
        """Compute the structural components used for the covariance calculation."""
        V10 = [1 / len(Y) * sum([self._kernel(x, y) for y in Y]) for x in X]
        V01 = [1 / len(X) * sum([self._kernel(x, y) for x in X]) for y in Y]
        return V10, V01

    def _get_S_entry(self, V_A, V_B, auc_A, auc_B):
        """Calculate an entry of the covariance matrix."""
        return 1 / (len(V_A) - 1) * sum([(a - auc_A) * (b - auc_B) for a, b in zip(V_A, V_B)])

    def _z_score(self, var_A, var_B, covar_AB, auc_A, auc_B):
        """Calculate the z-score for the two-tailed test."""
        return (auc_A - auc_B) / np.sqrt(var_A + var_B - 2 * covar_AB + 1e-8)

    def _group_preds_by_label(self, preds, actual):
        """Group predictions by their actual label."""
        X = [p for (p, a) in zip(preds, actual) if a]
        Y = [p for (p, a) in zip(preds, actual) if not a]
        return X, Y

    def _compute_z_p(self):
        """Main function to compute the z-score and p-value for the DeLong test."""
        X_A, Y_A = self._group_preds_by_label(self._preds1, self._label)
        X_B, Y_B = self._group_preds_by_label(self._preds2, self._label)

        V_A10, V_A01 = self._structural_components(X_A, Y_A)
        V_B10, V_B01 = self._structural_components(X_B, Y_B)

        auc_A = self._auc(X_A, Y_A)
        auc_B = self._auc(X_B, Y_B)

        # Compute entries of the covariance matrix
        var_A = self._get_S_entry(V_A10, V_A10, auc_A, auc_A) / len(V_A10) + self._get_S_entry(V_A01, V_A01, auc_A, auc_A) / len(V_A01)
        var_B = self._get_S_entry(V_B10, V_B10, auc_B, auc_B) / len(V_B10) + self._get_S_entry(V_B01, V_B01, auc_B, auc_B) / len(V_B01)
        covar_AB = self._get_S_entry(V_A10, V_B10, auc_A, auc_B) / len(V_A10) + self._get_S_entry(V_A01, V_B01, auc_A, auc_B) / len(V_A01)

        # Calculate z-score
        z = self._z_score(var_A, var_B, covar_AB, auc_A, auc_B)
        p = st.norm.sf(abs(z)) * 2  # Two-tailed p-value

        return z, p

    def _show_result(self):
        """Print the results of the test."""
        print(f"z score = {self.z:.5f};\np value = {self.p:.5f};")
        print("There is a significant difference" if self.p < self.threshold else "There is NO significant difference")

    def get_results(self):
        """Return the z-score and p-value for the test."""
        return self.z, self.p


def generate_data():
    X, y = make_classification(
        n_samples=1000,  # Number of samples
        n_features=10,  # Number of features
        n_informative=5,  # Number of informative features
        n_redundant=0,  # Number of redundant features
        n_clusters_per_class=1,  # Number of clusters per class
        random_state=42
    )
    return X, y


def test_delong_fuction():

    X, y = generate_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # 第一个模型
    dtrain1 = xgb.DMatrix(X_train, label=y_train)
    bst1 = xgb.train({}, dtrain1)
    dtest = xgb.DMatrix(X_test)
    y_pred_prob1 = bst1.predict(dtest)

    # 第二个模型（假设参数稍有变化，实际应根据需求修改）
    dtrain2 = xgb.DMatrix(X_train, label=y_train)
    bst2 = xgb.train({'max_depth': 3}, dtrain2)
    y_pred_prob2 = bst2.predict(dtest)

    # DeLong检验
    delong = DelongTest(y_pred_prob1, y_pred_prob2, y_test)
    z, p_value = delong.get_results()
    print(f"P-value from DeLong's test: {p_value:.5f}")

    # ROC curve and AUC
    fpr1, tpr1, _ = roc_curve(y_test, y_pred_prob1)
    roc_auc1 = auc(fpr1, tpr1)
    fpr2, tpr2, _ = roc_curve(y_test, y_pred_prob2)
    roc_auc2 = auc(fpr2, tpr2)

    plt.figure(figsize=(5, 4))
    plt.plot(fpr1, tpr1, color='blue', label='Model 1 (AUC = %0.2f)' % roc_auc1)
    plt.plot(fpr2, tpr2, color='red', label='Model 2 (AUC = %0.2f)' % roc_auc2)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic (ROC)\nDeLong Test p-value: {p_value:.5f}')
    plt.legend(loc="lower right")
    plt.show()


if __name__ == '__main__':
    # Test the Delong test function
    test_delong_fuction()

z score = -1.52065;
p value = 0.12835;
There is NO significant difference
P-value from DeLong's test: 0.12835

_images/PanCancer_949_cellline_models_3_1.png

Define the benchmark models¶

Gradient boosting-based XGBoost
Decision tree-based random forest
Linear regression-based logistic regression

[3]:

class ModelEvaluator_CM:
    def __init__(self, data1, data2, target, k_fold = 5, random_state=42, outpath=None):
        self.datasets = {'Dataset 1': data1, 'Dataset 2': data2}
        self.target = target
        self.k_fold = k_fold
        self.random_state = random_state
        self.outpath = outpath
        self.models = {
            # 'XGBoost': xgb.XGBClassifier(max_depth=4,
            #                             learning_rate=0.2,
            #                             reg_lambda=1,
            #                             n_estimators=150,
            #                             subsample=0.9,  # original 0.9
            #                             colsample_bytree=0.9,),
            'XGBoost': xgb.XGBClassifier(n_estimators=150),
            'Random Forest': RandomForestClassifier(),
            'Logistic Regression': LogisticRegression()
        }
        self.results = {}

    def train_models(self):
            skf = StratifiedKFold(n_splits=self.k_fold, shuffle=True, random_state=self.random_state)

            for name, data in self.datasets.items():
                y = data[self.target]
                X = data.drop(columns=[self.target])

                self.results[name] = {}

                for model_name, model in self.models.items():
                    y_preds = []
                    y_tests = []

                    for train_index, test_index in skf.split(X, y):
                        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                        if model_name == 'XGBoost':
                            dtrain = xgb.DMatrix(X_train, label=y_train)
                            model_instance = xgb.train({}, dtrain)
                            dtest = xgb.DMatrix(X_test)
                            y_pred = model_instance.predict(dtest)
                        else:
                            model_instance = model.fit(X_train, y_train)
                            y_pred = model_instance.predict_proba(X_test)[:, 1]

                        y_preds.extend(y_pred)
                        y_tests.extend(y_test.tolist())

                    self.results[name][model_name] = (y_tests, y_preds)

    def plot_rocs(self):
            n_models = len(self.models)
            fig, axes = plt.subplots(2, n_models, figsize=(4.2 * n_models, 12))  # 修改此处为3行

            for i, model_name in enumerate(self.models.keys()):
                for dataset_name, dataset_results in self.results.items():
                    y_test, y_pred = dataset_results[model_name]

                    # ROC curve
                    fpr, tpr, _ = roc_curve(y_test, y_pred)
                    roc_auc = auc(fpr, tpr)
                    # axes[0, i].plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')

                    # DeLong Test
                    delong = DelongTest(self.results['Dataset 1'][model_name][1],
                                        self.results['Dataset 2'][model_name][1],
                                        self.results['Dataset 1'][model_name][0])
                    _, p_value = delong.get_results()

                    # Confusion Matrix for each dataset
                    y_pred_binary = [1 if p >= 0.5 else 0 for p in y_pred]
                    cm = confusion_matrix(y_test, y_pred_binary)
                    print(f'Confusion Matrix of {model_name} on {dataset_name}:')
                    print(cm)
                    print('\n')
                    self.plot_confusion_matrix_on_axes(cm, ['Negative', 'Positive'], axes[0 if dataset_name == 'Dataset 1' else 1, i], title=f'CM of {model_name} on {dataset_name}')

                # axes[0, i].plot([0, 1], [0, 1], color='navy', linestyle='--')
                # axes[0, i].set_xlabel('False Positive Rate')
                # axes[0, i].set_ylabel('True Positive Rate')
                # axes[0, i].set_title(f'{model_name}\nDeLong Test p-value: {p_value:.5f}')
                # axes[0, i].legend(loc="lower right")

            plt.tight_layout()
            if self.outpath:
                plt.savefig(self.outpath)
            plt.show()

    def plot_confusion_matrix_on_axes(self, cm, classes, ax, title='Confusion Matrix', cmap=plt.cm.Blues):
        ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.set_title(title)
        tick_marks = np.arange(len(classes))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(classes, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(classes)

        fmt = 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            ax.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')


class ModelEvaluator_1:
    def __init__(self, data1, data2, target, k_fold = 5, random_state=42, outpath=None):
        self.datasets = {'Dataset 1': data1, 'Dataset 2': data2}
        self.target = target
        self.k_fold = k_fold
        self.random_state = random_state
        self.outpath = outpath
        self.models = {
            # 'XGBoost': xgb.XGBClassifier(max_depth=4,
            #                             learning_rate=0.2,
            #                             reg_lambda=1,
            #                             n_estimators=150,
            #                             subsample=0.9,  # original 0.9
            #                             colsample_bytree=0.9,),
            'XGBoost': xgb.XGBClassifier(n_estimators=150),
            'Random Forest': RandomForestClassifier(),
            'Logistic Regression': LogisticRegression()
        }
        self.results = {}

    def train_models(self):
            skf = StratifiedKFold(n_splits=self.k_fold, shuffle=True, random_state=self.random_state)

            for name, data in self.datasets.items():
                y = data[self.target]
                X = data.drop(columns=[self.target])

                self.results[name] = {}

                for model_name, model in self.models.items():
                    y_preds = []
                    y_tests = []

                    for train_index, test_index in skf.split(X, y):
                        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                        if model_name == 'XGBoost':
                            dtrain = xgb.DMatrix(X_train, label=y_train)
                            model_instance = xgb.train({}, dtrain)
                            dtest = xgb.DMatrix(X_test)
                            y_pred = model_instance.predict(dtest)
                        else:
                            model_instance = model.fit(X_train, y_train)
                            y_pred = model_instance.predict_proba(X_test)[:, 1]

                        y_preds.extend(y_pred)
                        y_tests.extend(y_test.tolist())

                    self.results[name][model_name] = (y_tests, y_preds)

    def plot_rocs(self):
        n_models = len(self.models)
        fig, axes = plt.subplots(1, n_models, figsize=(4.3 * n_models, 4))

        for ax, model_name in zip(axes, self.models.keys()):
            for dataset_name, dataset_results in self.results.items():
                y_test, y_pred = dataset_results[model_name]
                fpr, tpr, _ = roc_curve(y_test, y_pred)
                roc_auc = auc(fpr, tpr)
                # 95% CI for roc_auc
                auc_scores = []
                for _ in range(1000):
                    idx = np.random.randint(0, len(y_test), len(y_test))
                    auc_scores.append(roc_auc_score(np.array(y_test)[idx], np.array(y_pred)[idx]))
                auc_scores = np.array(auc_scores)
                auc_scores.sort()
                auc_lower = auc_scores[int(0.025 * len(auc_scores))]
                auc_upper = auc_scores[int(0.975 * len(auc_scores))]
                ax.fill_between(fpr, tpr, alpha=0.3)
                ax.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f} [{auc_lower:.2f}, {auc_upper:.2f}])')
                ax.plot(fpr, tpr, label=f'{model_name} on {dataset_name} (AUC = {roc_auc:.2f})')

            delong = DelongTest(self.results['Dataset 1'][model_name][1],
                                self.results['Dataset 2'][model_name][1],
                                self.results['Dataset 1'][model_name][0])
            _, p_value = delong.get_results()

            ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title(f'{model_name}\nDeLong Test p-value: {p_value:.5f}')
            ax.legend(loc="lower right")

        plt.tight_layout()
        if self.outpath:
            plt.savefig(self.outpath)
        plt.show()

Define the benchmark models with 95% CI AUC scores¶

[4]:

class ModelEvaluator:
    def __init__(self, data1, data2, target, k_fold=5, random_state=42, outpath=None):
        self.datasets = {'Original data': data1, 'STAVER data': data2}
        self.target = target
        self.k_fold = k_fold
        self.random_state = random_state
        self.outpath = outpath
        self.models = {
            'XGBoost': xgb.XGBClassifier(n_estimators=150),
            'Random Forest': RandomForestClassifier(),
            'Logistic Regression': LogisticRegression()
        }
        self.results = {}

    def train_models(self):
        skf = StratifiedKFold(n_splits=self.k_fold, shuffle=True, random_state=self.random_state)
        for name, data in self.datasets.items():
            y = data[self.target]
            X = data.drop(columns=[self.target])
            self.results[name] = {}
            for model_name, model in self.models.items():
                y_preds = []
                y_tests = []
                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                    if model_name == 'XGBoost':
                        dtrain = xgb.DMatrix(X_train, label=y_train)
                        model_instance = xgb.train({}, dtrain)
                        dtest = xgb.DMatrix(X_test)
                        y_pred = model_instance.predict(dtest)
                    else:
                        model_instance = model.fit(X_train, y_train)
                        y_pred = model_instance.predict_proba(X_test)[:, 1]
                    y_preds.extend(y_pred)
                    y_tests.extend(y_test.tolist())
                self.results[name][model_name] = (y_tests, y_preds)

    def plot_rocs(self):
        n_models = len(self.models)
        fig, axes = plt.subplots(1, n_models, figsize=(4.3 * n_models, 4))
        for ax, model_name in zip(axes, self.models.keys()):
            for dataset_name, dataset_results in self.results.items():
                y_test, y_pred = dataset_results[model_name]
                fpr, tpr, _ = roc_curve(y_test, y_pred)
                roc_auc = auc(fpr, tpr)
                # 95% CI for AUC using bootstrapping (1000 samples)
                auc_scores = []
                for _ in range(1000):
                    idx = np.random.randint(0, len(y_test), len(y_test))
                    auc_scores.append(roc_auc_score(np.array(y_test)[idx], np.array(y_pred)[idx]))
                auc_scores = np.array(auc_scores)
                auc_scores.sort()
                auc_lower = auc_scores[int(0.025 * len(auc_scores))]
                auc_upper = auc_scores[int(0.975 * len(auc_scores))]
                if self.auc_scale:
                    # Print the 95% CI AUC
                    print(f'{model_name} on {dataset_name}: AUC = {roc_auc:.2f} ({auc_lower:.2f}-{auc_upper:.2f})')
                    # ax.fill_between(fpr, tpr, alpha=0.3)
                    ax.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f} [{auc_lower:.2f}-{auc_upper:.2f}])')
                else:
                    # Print the 95% CI AUC
                    print(f'{model_name} on {dataset_name}: AUC = {roc_auc:.2f} ({auc_lower:.2f}-{auc_upper:.2f})')
                    # ax.fill_between(fpr, tpr, alpha=0.3)
                    ax.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f} [{auc_lower:.2f}-{auc_upper:.2f}])')
                # ax.plot(fpr, tpr, label=f'{model_name} on {dataset_name} (AUC = {roc_auc:.2f})')
                # ax.plot(fpr, tpr, label=f'{model_name} on {dataset_name} (AUC = {roc_auc:.2f})')
            delong = DelongTest(self.results['Original data'][model_name][1],
                                self.results['STAVER data'][model_name][1],
                                self.results['Original data'][model_name][0])
            _, p_value = delong.get_results()
            print("\n")
            ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title(f'{model_name}\nDeLong Test p-value: {p_value:.5f}')
            ax.legend(loc="lower right")
        plt.tight_layout()
        if self.outpath:
            plt.savefig(self.outpath)
        plt.show()

Load the Original data and the STAVER-processed data¶

[5]:

# Load data
STAVER = pd.read_csv("~/STAVER-revised/model-evlauate/STAVER_data.csv", index_col=0)
raw_data = pd.read_csv("~/STAVER-revised/model-evlauate/raw_data.csv", index_col=0)

[6]:

def handle_missing_value(data, condition, columns):
    """Handele the missing value for a given columns."""
    subset = data[condition].loc[:, columns]
    subset.replace(0, np.nan, inplace=True)
    subset.fillna(subset.mean(), inplace=True)
    data.loc[condition, columns] = subset
    return data

def extract_data(data, target="Cancer_type", genes=None, cancer_types=None, handle_genes=None, handle_target_na=False, handle_genes_na=False):
    data = data.copy()

    if genes is None:
        genes = []
    # Include the target in the genes list without modifying the original genes list
    all_columns = genes + [target]
    # Filter columns based on the genes list
    data = data[all_columns]

    # Remove the target from the genes list for subsequent operations
    genes = [gene for gene in genes if gene != target]

    if cancer_types is not None:
        if len(cancer_types) > 1:  # If there are multiple cancer_types, filter rows based on them
            data = data[data[target].isin(cancer_types)]
        elif len(cancer_types) == 1:  # If there's only one cancer_type
            data.loc[data[target] != cancer_types[0], target] = 'The rest.'

    if handle_target_na:
        condition = (data[target] == cancer_types[0])
        data = handle_missing_value(data, condition, handle_genes)

    elif handle_genes_na:
        condition1 = (data[target] == cancer_types[0])
        condition2 = (data[target] != cancer_types[0])
        data = handle_missing_value(data, condition1, handle_genes)
        data = handle_missing_value(data, condition2, handle_genes)

    return data

def label_encode(data, target="Cancer_type"):
    data = data.copy()
    data[target] = data[target].astype('category')
    data[target] = data[target].cat.codes
    return data

def process_data(data, target="Cancer_type", genes=None, cancer_types=None, handle_genes=None, handle_target_na = False, handle_genes_na = False):
    data = extract_data(data, target, genes, cancer_types, handle_genes, handle_target_na, handle_genes_na)
    data = label_encode(data, target=target)
    return data

def load_data(target="Cancer_type", genes=None, cancer_types=None, handle_genes = None,
            handle_target_na = False, handle_genes_na = False, replace_dataset1 = False, replace_dataset2 = False):
    # STAVER = pd.read_csv("~/STAVER-revised/model-evlauate/STAVER_data.csv", index_col=0)
    # raw_data = pd.read_csv("/~STAVER-revised/model-evlauate/raw_data.csv", index_col=0)
    if replace_dataset1:
        STAVER_model = process_data(STAVER, target, genes, cancer_types, handle_genes, handle_target_na, handle_genes_na)
    else:
        STAVER_model = process_data(STAVER, target, genes, cancer_types)
    if replace_dataset2:
        raw_data_model = process_data(raw_data, target, genes, cancer_types, handle_genes, handle_target_na, handle_genes_na)
    else:
        raw_data_model = process_data(raw_data, target, genes, cancer_types)
    return STAVER_model, raw_data_model

Train the benchmark models based on the reported cancer biomarkers¶

Hepatocellular Carcinoma biomarkers¶

[66]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                        genes=["HSPA4","GGT1","DCP","A1CF"],
                                        cancer_types=["Hepatocellular Carcinoma"],
                                        )

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_HCC_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.84 (0.79-0.89)
XGBoost on STAVER data: AUC = 0.96 (0.95-0.96)
z score = -4.19364;
p value = 0.00003;
There is a significant difference


Random Forest on Original data: AUC = 0.86 (0.81-0.90)
Random Forest on STAVER data: AUC = 0.96 (0.95-0.96)
z score = -4.54820;
p value = 0.00001;
There is a significant difference


Logistic Regression on Original data: AUC = 0.80 (0.77-0.84)
Logistic Regression on STAVER data: AUC = 0.92 (0.91-0.94)
z score = -7.61224;
p value = 0.00000;
There is a significant difference

_images/PanCancer_949_cellline_models_13_1.png

[92]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                        genes=["HSPA4","GGT1","DCP","A1CF"],
                                        cancer_types=["Hepatocellular Carcinoma"],
                                        )

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_HCC_markers_CM.pdf"
evaluator = ModelEvaluator_CM(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

z score = -4.19364;
p value = 0.00003;
There is a significant difference
Confusion Matrix of XGBoost on Dataset 1:
[[  36   39]
 [  11 1156]]


z score = -4.19364;
p value = 0.00003;
There is a significant difference
Confusion Matrix of XGBoost on Dataset 2:
[[  59   16]
 [  11 1156]]


z score = -4.74088;
p value = 0.00000;
There is a significant difference
Confusion Matrix of Random Forest on Dataset 1:
[[  33   42]
 [   6 1161]]


z score = -4.74088;
p value = 0.00000;
There is a significant difference
Confusion Matrix of Random Forest on Dataset 2:
[[  60   15]
 [  14 1153]]


z score = -7.61224;
p value = 0.00000;
There is a significant difference
Confusion Matrix of Logistic Regression on Dataset 1:
[[   0   75]
 [   2 1165]]


z score = -7.61224;
p value = 0.00000;
There is a significant difference
Confusion Matrix of Logistic Regression on Dataset 2:
[[  16   59]
 [  30 1137]]

_images/PanCancer_949_cellline_models_14_1.png

Gastric Carcinoma biomarkers¶

[67]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                        genes=["ERBB2","AGR3","AKR7A3","CTSE"],
                                        cancer_types=["Gastric Carcinoma"],
                                        )

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_Gastric_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.86 (0.82-0.89)
XGBoost on STAVER data: AUC = 0.95 (0.93-0.96)
z score = -5.33799;
p value = 0.00000;
There is a significant difference


Random Forest on Original data: AUC = 0.85 (0.82-0.88)
Random Forest on STAVER data: AUC = 0.94 (0.92-0.95)
z score = -4.22570;
p value = 0.00002;
There is a significant difference


Logistic Regression on Original data: AUC = 0.77 (0.73-0.80)
Logistic Regression on STAVER data: AUC = 0.93 (0.91-0.94)
z score = -9.25535;
p value = 0.00000;
There is a significant difference

_images/PanCancer_949_cellline_models_16_1.png

Colorectal Carcinoma biomarkers¶

[70]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                        genes=["MUC2", "EPCAM", "CDX2", "MUC13"],
                                        cancer_types=["Colorectal Carcinoma"])

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_CRC_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.85 (0.81-0.88)
XGBoost on STAVER data: AUC = 0.88 (0.86-0.91)
z score = -2.69639;
p value = 0.00701;
There is a significant difference


Random Forest on Original data: AUC = 0.84 (0.81-0.87)
Random Forest on STAVER data: AUC = 0.89 (0.86-0.92)
z score = -4.33362;
p value = 0.00001;
There is a significant difference


Logistic Regression on Original data: AUC = 0.84 (0.81-0.87)
Logistic Regression on STAVER data: AUC = 0.88 (0.86-0.91)
z score = -3.80086;
p value = 0.00014;
There is a significant difference

_images/PanCancer_949_cellline_models_18_1.png

NSCLC vs. SCLC (NSCLC and SCLC biomarkers)¶

[72]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["KRT5", "DSC3", "CHGA", "NCAM1", "SYP", "GRP"],
                                         cancer_types=["Small Cell Lung Carcinoma", "Non-Small Cell Lung Carcinoma"])

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_lung_markers.pdf"

evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.82 (0.78-0.87)
XGBoost on STAVER data: AUC = 0.91 (0.86-0.94)
z score = -2.71389;
p value = 0.00665;
There is a significant difference


Random Forest on Original data: AUC = 0.80 (0.74-0.85)
Random Forest on STAVER data: AUC = 0.95 (0.92-0.96)
z score = -5.21232;
p value = 0.00000;
There is a significant difference


Logistic Regression on Original data: AUC = 0.85 (0.81-0.89)
Logistic Regression on STAVER data: AUC = 0.94 (0.91-0.96)
z score = -3.96235;
p value = 0.00007;
There is a significant difference

_images/PanCancer_949_cellline_models_20_1.png

Glioma biomarkers¶

[ ]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["GFAP","NES"],
                                         cancer_types=["Glioma"])

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_glioma_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath,
                            )
evaluator.train_models()
evaluator.plot_rocs()

Pancreatic Carcinoma biomarkers¶

[77]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["MUC1", "ANO1", "DHRS9", "S100P"],
                                         cancer_types=["Pancreatic Carcinoma"])

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_PDAC_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.80 (0.75-0.85)
XGBoost on STAVER data: AUC = 0.88 (0.84-0.92)
z score = -2.80131;
p value = 0.00509;
There is a significant difference


Random Forest on Original data: AUC = 0.79 (0.74-0.84)
Random Forest on STAVER data: AUC = 0.88 (0.83-0.91)
z score = -3.26500;
p value = 0.00109;
There is a significant difference


Logistic Regression on Original data: AUC = 0.76 (0.70-0.82)
Logistic Regression on STAVER data: AUC = 0.85 (0.80-0.89)
z score = -4.05580;
p value = 0.00005;
There is a significant difference

_images/PanCancer_949_cellline_models_24_1.png

kindney Carcinoma¶

[80]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["CD70","CA9"],
                                         cancer_types=["Kidney Carcinoma"],
                                         )

outpath = "~DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_kindney_markers.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=5,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.84 (0.80-0.88)
XGBoost on STAVER data: AUC = 0.87 (0.84-0.91)
z score = -1.63729;
p value = 0.10157;
There is NO significant difference


Random Forest on Original data: AUC = 0.83 (0.79-0.87)
Random Forest on STAVER data: AUC = 0.89 (0.85-0.91)
z score = -3.19578;
p value = 0.00139;
There is a significant difference


Logistic Regression on Original data: AUC = 0.83 (0.79-0.87)
Logistic Regression on STAVER data: AUC = 0.87 (0.83-0.90)
z score = -2.83414;
p value = 0.00459;
There is a significant difference

_images/PanCancer_949_cellline_models_26_1.png

[81]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["CD70","CA9"],
                                         cancer_types=["Kidney Carcinoma"],
                                         )

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_kindney_markers_KF6.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=6,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.83 (0.79-0.87)
XGBoost on STAVER data: AUC = 0.87 (0.83-0.90)
z score = -1.65995;
p value = 0.09692;
There is NO significant difference


Random Forest on Original data: AUC = 0.83 (0.79-0.87)
Random Forest on STAVER data: AUC = 0.88 (0.85-0.91)
z score = -2.84790;
p value = 0.00440;
There is a significant difference


Logistic Regression on Original data: AUC = 0.83 (0.79-0.88)
Logistic Regression on STAVER data: AUC = 0.86 (0.82-0.90)
z score = -2.61851;
p value = 0.00883;
There is a significant difference

_images/PanCancer_949_cellline_models_27_1.png

[82]:

STAVER_model, raw_data_model = load_data(target="Cancer_type",
                                         genes=["CD70","CA9"],
                                         cancer_types=["Kidney Carcinoma"],
                                         )

outpath = "~/DIA-STAVER/949-cell-lines-1/model-evaluate/benchmark_models_based_on_kindney_markers_KF7.pdf"
evaluator = ModelEvaluator(raw_data_model, STAVER_model,
                            target='Cancer_type',
                            k_fold=7,
                            random_state=42,
                            outpath=outpath
                            )
evaluator.train_models()
evaluator.plot_rocs()

XGBoost on Original data: AUC = 0.84 (0.80-0.88)
XGBoost on STAVER data: AUC = 0.88 (0.85-0.92)
z score = -3.13140;
p value = 0.00174;
There is a significant difference


Random Forest on Original data: AUC = 0.83 (0.79-0.87)
Random Forest on STAVER data: AUC = 0.88 (0.84-0.91)
z score = -2.73058;
p value = 0.00632;
There is a significant difference


Logistic Regression on Original data: AUC = 0.83 (0.78-0.87)
Logistic Regression on STAVER data: AUC = 0.86 (0.82-0.90)
z score = -2.52993;
p value = 0.01141;
There is a significant difference

_images/PanCancer_949_cellline_models_28_1.png

The summary of the benchmark models’ performance acorss 7 cancer types¶

[85]:

Benchmark_models_performance = pd.read_excel("~/DIA-STAVER/STAVER-nc-revised/Revised-Tables/Supplementary-Tables/Supplementary-Table4.xlsx", sheet_name="11.Benchmark_models_performance", index_col=0)
Benchmark_models_performance

[85]:

	Biomarkers	Benchmark Models	Original data	STAVER data	DeLong's Test p values
Cancer Type
Hepatocellular Carcinoma	GGT1, DCP, HSPA4, A1CF	XGBoost	AUC = 0.84 (0.79-0.89)	AUC = 0.96 (0.95-0.96)	0.00003
Hepatocellular Carcinoma	GGT1, DCP, HSPA4, A1CF	Random Forest	AUC = 0.86 (0.81-0.90)	AUC = 0.96 (0.95-0.96)	0.00001
Hepatocellular Carcinoma	GGT1, DCP, HSPA4, A1CF	Logistic Regression	AUC = 0.80 (0.77-0.84)	AUC = 0.92 (0.91-0.94)	0.00001
Gastric Carcinoma	ERBB2, AGR3, CTSE, AKR7A3	XGBoost	AUC = 0.86 (0.82-0.89)	AUC = 0.95 (0.93-0.96)	0.00001
Gastric Carcinoma	ERBB2, AGR3, CTSE, AKR7A3	Random Forest	AUC = 0.85 (0.82-0.88)	AUC = 0.94 (0.92-0.95)	0.00002
Gastric Carcinoma	ERBB2, AGR3, CTSE, AKR7A3	Logistic Regression	AUC = 0.77 (0.73-0.80)	AUC = 0.93 (0.91-0.94)	0.00001
Colorectal Carcinoma	MUC2, EPCAM, CDX2, MUC13	XGBoost	AUC = 0.85 (0.81-0.88)	AUC = 0.88 (0.86-0.91)	0.00701
Colorectal Carcinoma	MUC2, EPCAM, CDX2, MUC13	Random Forest	AUC = 0.84 (0.81-0.87)	AUC = 0.89 (0.86-0.92)	0.00001
Colorectal Carcinoma	MUC2, EPCAM, CDX2, MUC13	Logistic Regression	AUC = 0.84 (0.81-0.87)	AUC = 0.88 (0.86-0.91)	0.00014
NSCLC vs. SCLC	KRT5, DSC3, CHGA, NCAM1, SYP, GRP	XGBoost	AUC = 0.82 (0.78-0.87)	AUC = 0.91 (0.86-0.94)	0.00665
NSCLC vs. SCLC	KRT5, DSC3, CHGA, NCAM1, SYP, GRP	Random Forest	AUC = 0.80 (0.74-0.85)	AUC = 0.95 (0.92-0.96)	0.00001
NSCLC vs. SCLC	KRT5, DSC3, CHGA, NCAM1, SYP, GRP	Logistic Regression	AUC = 0.85 (0.81-0.89)	AUC = 0.94 (0.91-0.96)	0.00007
Glioma	GFAP, NES	XGBoost	AUC = 0.79 (0.74-0.85)	AUC = 0.87 (0.82-0.92)	0.03704
Glioma	GFAP, NES	Random Forest	AUC = 0.80 (0.75-0.86)	AUC = 0.91 (0.87-0.94)	0.00037
Glioma	GFAP, NES	Logistic Regression	AUC = 0.68 (0.61-0.75)	AUC = 0.73 (0.66-0.79)	0.04942
kindney Carcinoma	CD70, CA9	XGBoost	AUC = 0.84 (0.80-0.88)	AUC = 0.87 (0.84-0.91)	0.10157
kindney Carcinoma	CD70, CA9	Random Forest	AUC = 0.83 (0.79-0.87)	AUC = 0.88 (0.85-0.91)	0.00440
kindney Carcinoma	CD70, CA9	Logistic Regression	AUC = 0.83 (0.79-0.87)	AUC = 0.87 (0.83-0.90)	0.00459
Pancreatic Carcinoma	MUC1, ANO1, ADGRF1, DHRS9, S100P	XGBoost	AUC = 0.80 (0.75-0.85)	AUC = 0.88 (0.84-0.92)	0.00509
Pancreatic Carcinoma	MUC1, ANO1, ADGRF1, DHRS9, S100P	Random Forest	AUC = 0.79 (0.74-0.84)	AUC = 0.88 (0.83-0.91)	0.00109
Pancreatic Carcinoma	MUC1, ANO1, ADGRF1, DHRS9, S100P	Logistic Regression	AUC = 0.76 (0.70-0.82)	AUC = 0.85 (0.80-0.89)	0.00005

[ ]: