diff --git a/.Rproj.user/CE4CE23/pcs/workbench-pane.pper b/.Rproj.user/CE4CE23/pcs/workbench-pane.pper index 6860e6b..ab5e950 100644 --- a/.Rproj.user/CE4CE23/pcs/workbench-pane.pper +++ b/.Rproj.user/CE4CE23/pcs/workbench-pane.pper @@ -1,5 +1,5 @@ { "TabSet1": 3, - "TabSet2": 2, + "TabSet2": 0, "TabZoom": {} } \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59fd2b3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata +data/ccleData.RData +data/Cell_line_RMA_proc_basalExp.txt +data/cgp2016ExprRma.RData diff --git a/inst/doc/vignetteOutline.R b/inst/doc/vignetteOutline.R deleted file mode 100644 index 921b24d..0000000 --- a/inst/doc/vignetteOutline.R +++ /dev/null @@ -1,206 +0,0 @@ -### R code from vignette source 'vignetteOutline.Snw' - -################################################### -### code chunk number 1: vignetteOutline.Snw:11-14 -################################################### -library(pRRophetic) -library(ggplot2) -set.seed(12345) - - -################################################### -### code chunk number 2: vignetteOutline.Snw:23-24 -################################################### -data("bortezomibData") #exprDataBortezomib, bortIndex, studyResponse and studyIndex - - -################################################### -### code chunk number 3: vignetteOutline.Snw:28-29 -################################################### -pRRopheticQQplot("Bortezomib") - - -################################################### -### code chunk number 4: vignetteOutline.Snw:34-36 -################################################### -cvOut <- pRRopheticCV("Bortezomib", cvFold=5, testExprData=exprDataBortezomib) -summary(cvOut) - - -################################################### -### code chunk number 5: vignetteOutline.Snw:40-41 -################################################### -plot(cvOut) - - -################################################### -### code chunk number 6: vignetteOutline.Snw:45-51 -################################################### -predictedPtype <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -selection=1) -predictedPtype_blood <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -"blood", selection=1) -predictedPtype_solid <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -"allSolidTumors", selection=1) - - -################################################### -### code chunk number 7: vignetteOutline.Snw:55-64 -################################################### -t.test(predictedPtype[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") -t.test(predictedPtype_blood[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype_blood[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") -t.test(predictedPtype_solid[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype_solid[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") - - -################################################### -### code chunk number 8: vignetteOutline.Snw:68-70 -################################################### -allTissuesPpv <- getPPV(predResponders=predictedPtype[((studyResponse == "PGx_Responder = R") & bortIndex)], predNonResponders=predictedPtype[((studyResponse == "PGx_Responder = NR") & bortIndex)], drug="Bortezomib") -bloodPpv <- getPPV(predResponders=predictedPtype_blood[((studyResponse == "PGx_Responder = R") & bortIndex)], predNonResponders=predictedPtype_blood[((studyResponse == "PGx_Responder = NR") & bortIndex)], drug="Bortezomib", tissue="blood") - - -################################################### -### code chunk number 9: vignetteOutline.Snw:75-80 -################################################### -df <- stack(list(NR=predictedPtype_blood[((studyResponse == "PGx_Responder = NR") -& bortIndex)], R=predictedPtype_blood[((studyResponse == "PGx_Responder = R") & -bortIndex)])) -ggplot(data=df, aes(y=values, x=ind)) + geom_boxplot(alpha=.3, fill=c("#CC0033", "#006633")) + -theme_bw() + ylab("Predicted Bortezomib Sensitivity") + xlab("Clinical Response") - - -################################################### -### code chunk number 10: vignetteOutline.Snw:86-87 -################################################### -data(ccleData) #sensDataCcle, exprMatCcle - - -################################################### -### code chunk number 11: vignetteOutline.Snw:91-93 -################################################### -cvOut_pd <- pRRopheticCV("PD.0325901", cvFold=5, testExprData=exprMatCcle) -summary(cvOut_pd) - - -################################################### -### code chunk number 12: vignetteOutline.Snw:97-98 -################################################### -plot(cvOut_pd) - - -################################################### -### code chunk number 13: vignetteOutline.Snw:102-103 -################################################### -predictedPtype_ccle <- pRRopheticPredict(exprMatCcle, "PD.0325901", selection=1) - - -################################################### -### code chunk number 14: vignetteOutline.Snw:108-115 -################################################### -cellLinesWithCcleIc50s <- names(predictedPtype_ccle)[names(predictedPtype_ccle) %in% -sensDataCcle$CCLE.Cell.Line.Name] -predCcleOrd <- predictedPtype_ccle[names(predictedPtype_ccle)] -ccleActArea_pd <- -sensDataCcle$"ActArea"[sensDataCcle$Compound == "PD-0325901"] -names(ccleActArea_pd) <- sensDataCcle$"CCLE.Cell.Line.Name"[sensDataCcle$Compound == -"PD-0325901"] -ccleActAreaord <- ccleActArea_pd[cellLinesWithCcleIc50s] - - -################################################### -### code chunk number 15: vignetteOutline.Snw:119-121 -################################################### -cor.test(predictedPtype_ccle[cellLinesWithCcleIc50s], ccleActAreaord, -method="spearman") - - -################################################### -### code chunk number 16: vignetteOutline.Snw:125-130 -################################################### -df2 <- data.frame(predCcle=predictedPtype_ccle[cellLinesWithCcleIc50s], -actAreaCcle=ccleActAreaord) -ggplot(data=df2, aes(y=predCcle, x=actAreaCcle)) + geom_point(alpha=0.5) + -geom_smooth(method=lm) + theme_bw() + xlab("Measured Activity Area") + -ylab("Predicted Drug Sensitivity") - - -################################################### -### code chunk number 17: vignetteOutline.Snw:134-136 -################################################### -predictedPtype_ccle_erlotinib <- pRRopheticLogisticPredict(exprMatCcle, "Erlotinib", -selection=1) - - -################################################### -### code chunk number 18: vignetteOutline.Snw:140-148 -################################################### -cellLinesWithCcleIc50s <- -names(predictedPtype_ccle_erlotinib)[names(predictedPtype_ccle_erlotinib) %in% -sensDataCcle$CCLE.Cell.Line.Name] -predCcleOrd <- predictedPtype_ccle_erlotinib[names(predictedPtype_ccle_erlotinib)] -ccleActArea_pd <- sensDataCcle$"ActArea"[sensDataCcle$Compound == "Erlotinib"] -names(ccleActArea_pd) <- sensDataCcle$"CCLE.Cell.Line.Name"[sensDataCcle$Compound == -"Erlotinib"] -ccleActAreaord <- ccleActArea_pd[cellLinesWithCcleIc50s] - - -################################################### -### code chunk number 19: vignetteOutline.Snw:152-156 -################################################### -resistant <- names(sort(ccleActAreaord))[1:55] #55 highly resistant cell lines. -sensitive <- names(sort(ccleActAreaord, decreasing=TRUE))[1:15] #15 sensitive -t.test(predictedPtype_ccle_erlotinib[resistant], -predictedPtype_ccle_erlotinib[sensitive]) - - -################################################### -### code chunk number 20: vignetteOutline.Snw:160-163 -################################################### -boxplot(list(Resistant=predictedPtype_ccle_erlotinib[resistant], -Sensitive=predictedPtype_ccle_erlotinib[sensitive]), pch=20, -vertical=TRUE, method="jitter", ylab="Log-odds of sensitivity") - - -################################################### -### code chunk number 21: vignetteOutline.Snw:172-178 -################################################### -trainExpr <- exprDataBortezomib[, (detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 25", "studyCode = 40")] -trainPtype <- detailedResponse[(detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 25", "studyCode = 40")] -testExpr <- exprDataBortezomib[, (detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 39")] - - -################################################### -### code chunk number 22: vignetteOutline.Snw:182-183 -################################################### -ptypeOut <- calcPhenotype(trainExpr, trainPtype, testExpr, selection=1) - - -################################################### -### code chunk number 23: vignetteOutline.Snw:187-190 -################################################### -testPtype <- detailedResponse[(detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 39")] -cor.test(testPtype, ptypeOut, alternative="greater") - - -################################################### -### code chunk number 24: vignetteOutline.Snw:195-197 -################################################### -t.test(ptypeOut[testPtype %in% c(3,4,5)], ptypeOut[testPtype %in% c(1,2)], -alternative="greater") - - -################################################### -### code chunk number 25: vignetteOutline.Snw:200-201 -################################################### -sessionInfo() - - diff --git a/inst/doc/vignetteOutline.Snw b/inst/doc/vignetteOutline.Snw deleted file mode 100644 index 605df58..0000000 --- a/inst/doc/vignetteOutline.Snw +++ /dev/null @@ -1,209 +0,0 @@ -\documentclass[a4paper]{article} -\usepackage{placeins} -\title{pRRophetic User's guide} -\author{Dr. Paul Geeleher} -\begin{document} -\maketitle - -This file will demonstrate some use cases for the pRRophetic R packages. The package allows prediction of a phenotype from gene expression data. Here, we demonstrate the primary use case, which is prediction of clinical outcome using the Cancer Genome Project (CGP) cell line data. We also show that the package can be used for prediction of drug sensitivity in an external panel of cell lines (the Cancer Cell Line Encyclopedia (CCLE)) . Furthermore, we demonstrate that, at least in principle, the package can be used for prediction from clinical data (rather than just cell line data). - -First, load the library. -<<>>= -library(pRRophetic) -library(ggplot2) -set.seed(12345) -@ - - -\section{Predicting clinical outcome from the CGP cell lines} - -The primary use case of pRRophetic is predicting clinical outcome to chemotherapy, from baseline tumor gene expression data. This is achieved using the CGP cell lines as a training set and - -Load the bortezomib expression data. This is stored in a matrix with Gene Symbols as row names. Note that in order for "pRRopheticPredict" to work, the matrix MUST be annotated with official gene symbols. -<<>>= -data("bortezomibData") #exprDataBortezomib, bortIndex, studyResponse and studyIndex -@ - -Assess the normality of the data. For many of the drug in CGP, the data deviate wildly from a normal distribtion and should not be fit as the response variable in a linear model (i.e. pRRopheticPredict is based on linear ridge regression). -<>= -pRRopheticQQplot("Bortezomib") -@ - - -Perform 5-fold cross-validation on the training set (i.e. the CGP cell lines). This will give an indication of whether we may be able to predict clinical drug sensitivity. -<<>>= -cvOut <- pRRopheticCV("Bortezomib", cvFold=5, testExprData=exprDataBortezomib) -summary(cvOut) -@ - -Plot the cross validation predicted phenotype against the measured IC50s. -<>= -plot(cvOut) -@ - -Based on the qqplot it is likely acceptable to use these data for prediction of bortezomib sensitivity. Predict bortezomib sensitivity using all cell lines, then only cell lines from hematological cancers and then only cell lines from derived from solid tumors. -<<>>= -predictedPtype <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -selection=1) -predictedPtype_blood <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -"blood", selection=1) -predictedPtype_solid <- pRRopheticPredict(exprDataBortezomib, "Bortezomib", -"allSolidTumors", selection=1) -@ - -Compare these three types of models. Interestingly, models trained on only "blood" cancer cell lines perform best. -<<>>= -t.test(predictedPtype[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") -t.test(predictedPtype_blood[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype_blood[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") -t.test(predictedPtype_solid[((studyResponse == "PGx_Responder = NR") & bortIndex)], -predictedPtype_solid[((studyResponse == "PGx_Responder = R") & bortIndex)], -alternative="greater") -@ - -Estimate the PPN and PPV values for the predictions above. A cutpoint is estimated from the mean IC50 value in the training data. -<<>>= -allTissuesPpv <- getPPV(predResponders=predictedPtype[((studyResponse == "PGx_Responder = R") & bortIndex)], predNonResponders=predictedPtype[((studyResponse == "PGx_Responder = NR") & bortIndex)], drug="Bortezomib") -bloodPpv <- getPPV(predResponders=predictedPtype_blood[((studyResponse == "PGx_Responder = R") & bortIndex)], predNonResponders=predictedPtype_blood[((studyResponse == "PGx_Responder = NR") & bortIndex)], drug="Bortezomib", tissue="blood") -@ - - -Make a boxplot of the results of the blood-only model. -<>= -df <- stack(list(NR=predictedPtype_blood[((studyResponse == "PGx_Responder = NR") -& bortIndex)], R=predictedPtype_blood[((studyResponse == "PGx_Responder = R") & -bortIndex)])) -ggplot(data=df, aes(y=values, x=ind)) + geom_boxplot(alpha=.3, fill=c("#CC0033", "#006633")) + -theme_bw() + ylab("Predicted Bortezomib Sensitivity") + xlab("Clinical Response") -@ - -\section{Predict drug sensitivity in CCLE. Demonstrate both linear and logistic models} - -Lets predict for PD0332991 using pRRopheticPredict(). First, load the CCLE expression and phenotype data. This loads two objects sensDataCcle and exprMatCcle. -<<>>= -data(ccleData) #sensDataCcle, exprMatCcle -@ - -Do 10 fold cross-validation on this drug. -<<>>= -cvOut_pd <- pRRopheticCV("PD.0325901", cvFold=5, testExprData=exprMatCcle) -summary(cvOut_pd) -@ - -Plot the cross-validation predicted phenotype against the measured phenotype. -<>= -plot(cvOut_pd) -@ - -Run the prediction for PD0325901 on the CCLE data. -<<>>= -predictedPtype_ccle <- pRRopheticPredict(exprMatCcle, "PD.0325901", selection=1) -@ - - -Get the ActArea (a measure of drug sensitivity) for the CCLE cell lines for which we have just predicted IC50. We will not use IC50 values as they have been capped at the maximum drug screening concentration in CCLE. -<<>>= -cellLinesWithCcleIc50s <- names(predictedPtype_ccle)[names(predictedPtype_ccle) %in% -sensDataCcle$CCLE.Cell.Line.Name] -predCcleOrd <- predictedPtype_ccle[names(predictedPtype_ccle)] -ccleActArea_pd <- -sensDataCcle$"ActArea"[sensDataCcle$Compound == "PD-0325901"] -names(ccleActArea_pd) <- sensDataCcle$"CCLE.Cell.Line.Name"[sensDataCcle$Compound == -"PD-0325901"] -ccleActAreaord <- ccleActArea_pd[cellLinesWithCcleIc50s] -@ - -Compare prediction to measured IC50, it is actually higher than the correlation achieved for remeasuring the drug sensitivity. -<<>>= -cor.test(predictedPtype_ccle[cellLinesWithCcleIc50s], ccleActAreaord, -method="spearman") -@ - -Plot the resulting correlation between predicted and measured values. -<>= -df2 <- data.frame(predCcle=predictedPtype_ccle[cellLinesWithCcleIc50s], -actAreaCcle=ccleActAreaord) -ggplot(data=df2, aes(y=predCcle, x=actAreaCcle)) + geom_point(alpha=0.5) + -geom_smooth(method=lm) + theme_bw() + xlab("Measured Activity Area") + -ylab("Predicted Drug Sensitivity") -@ - -Next, do some prediction for Erlotinib, a targeted agent. To demonstrate how to create a logistic model, lets predict for erlotinib using pRRopheticLogisiticPredict(). This function will return the log odds of sensitivity -<<>>= -predictedPtype_ccle_erlotinib <- pRRopheticLogisticPredict(exprMatCcle, "Erlotinib", -selection=1) -@ - -Get the ActArea for the CCLE cell lines for which we have just predicted IC50. -<<>>= -cellLinesWithCcleIc50s <- -names(predictedPtype_ccle_erlotinib)[names(predictedPtype_ccle_erlotinib) %in% -sensDataCcle$CCLE.Cell.Line.Name] -predCcleOrd <- predictedPtype_ccle_erlotinib[names(predictedPtype_ccle_erlotinib)] -ccleActArea_pd <- sensDataCcle$"ActArea"[sensDataCcle$Compound == "Erlotinib"] -names(ccleActArea_pd) <- sensDataCcle$"CCLE.Cell.Line.Name"[sensDataCcle$Compound == -"Erlotinib"] -ccleActAreaord <- ccleActArea_pd[cellLinesWithCcleIc50s] -@ - -There are a very large number of cell lines resistant to Erlotinib (within the drug screening window), so a correlation is not an appropriate measure of concordance. So lets do a t-test between some of the most sensitive and resistant cell lines to assess whether signal is being captured by the predictions. -<<>>= -resistant <- names(sort(ccleActAreaord))[1:55] #55 highly resistant cell lines. -sensitive <- names(sort(ccleActAreaord, decreasing=TRUE))[1:15] #15 sensitive -t.test(predictedPtype_ccle_erlotinib[resistant], -predictedPtype_ccle_erlotinib[sensitive]) -@ - -Despite the fact that IC50 values are not correlated for this drug between these studies, the most sensitive/resistant samples are separated highly significantly with this logistic models. -<<>>= -boxplot(list(Resistant=predictedPtype_ccle_erlotinib[resistant], -Sensitive=predictedPtype_ccle_erlotinib[sensitive]), pch=20, -vertical=TRUE, method="jitter", ylab="Log-odds of sensitivity") -@ - - -\section{Clinical drug-sensitivity prediction from clinical data} - -Include, is an example, prediction from the bortezomib clinical data where we try to predict CR, PR, MR, NC, PD from CR, PR, MR, NC, PD. This serves as both an example of prediction directly from clinical data and of using a dataset other than the CGP from which to predict. - -First, prepare the training data and test expression data. -<<>>= -trainExpr <- exprDataBortezomib[, (detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 25", "studyCode = 40")] -trainPtype <- detailedResponse[(detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 25", "studyCode = 40")] -testExpr <- exprDataBortezomib[, (detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 39")] -@ - -Calculate the predicted phenotype. -<<>>= -ptypeOut <- calcPhenotype(trainExpr, trainPtype, testExpr, selection=1) -@ - -We do capture some signal. -<<>>= -testPtype <- detailedResponse[(detailedResponse %in% c(1,2,3,4,5)) & -studyIndex %in% c("studyCode = 39")] -cor.test(testPtype, ptypeOut, alternative="greater") -@ - - -This t-test allows us to compare results directly to the cell line model, however, the cell line model outperforms this particular clinical model. -<<>>= -t.test(ptypeOut[testPtype %in% c(3,4,5)], ptypeOut[testPtype %in% c(1,2)], -alternative="greater") -@ - -<<>>= -sessionInfo() -@ - -\end{document} - - - - - diff --git a/inst/doc/vignetteOutline.pdf b/inst/doc/vignetteOutline.pdf deleted file mode 100644 index b783e04..0000000 Binary files a/inst/doc/vignetteOutline.pdf and /dev/null differ