############################################################################## # # This is R code for drawing cost plots using several distribution for costs # and several other features for evaluating classifiers. # It calculates: # - AUC* (area under the ROC curve), and GINI* # - AUCH (area under the convex hull of the ROC curve) # - MSE (Brier score), logloss, KS*. # - Refinement (Brier score decomposition) using the ROC convex hull # It draws: # - the kernel smoothed score distributions of the two classes* # - the ROC curve and convex hull* # - a plot of the minimum loss produced for each value of c* # - the weight function implicitly used by the AUC, as a function of score* # - the weight function implicitly used by the AUC, as a function of c* # - the weight function used by the AUC measure* # - cost plots, using a uniform distribution for costs (Beta(1,1) # - cost plots, using a Beta distribution for costs (Beta(2,2) # - cost plots, using a uniform distribution for skews (Beta(1,1) # - cost plots, using a Beta distribution for skews (Beta(2,2) # - on these plots, we can show: # - black: the minimum3 cost curve (refinement) # - pink: the Brier cost curves (using the scores as thresholds) # - orange: the cont-unif cost curves # - red the disc-unif cost curves # # This code has been developed by # JOSE HERNANDEZ-ORALLO, UNIVERSITAT POLITECNICA DE VALENCIA, SPAIN # jorallo@dsic.upv.es # reusing code (especially those features marked with a '*' above) by: # DAVID J. HAND, DEPARTMENT OF MATHEMATICS, IMPERIAL COLLEGE, LONDON # d.j.hand@imperial.ac.uk # and using ideas from: # PETER FLACH, UNIVERSITY OF BRISTOL # CESAR FERRI, UNIVERSITAT POLITECNICA DE VALENCIA # ############################################################################## ############################################### ############################################### ############### OUTPUT OPTIONS ################ ############################################### ####### WORKING DIRECTORY WORKDIR <- "C:/Users/instalar/Documents/R" setwd(WORKDIR) ####### OUTPUT DRAWING OPTIONS PLOTS_PER_PAGE <- 2 # 0 # if 0 it doesn't reset the page PLOTS_PER_PAGE_VERTICAL <- FALSE # especially when plotting two or three figures in a column PDF <- TRUE # Generate PDF PDFheight= 4 # 7 by default, so 14 makes it double higher than wide, 5 makes letters bigger (in proportion) for just one plot PDFwidth= 12 # 7 by default # 5 x 4 for 1 # 12 x 4 for 3 (ICML) in vertical (with PLOTS_PER_PAGE_VERTICAL TRUE) # 6 x 6 for 1 (ICML) # One plot one picture! # 12 x 12 for 6 (ICML) # 4 x 12 for 2 in wide (ICML) # 4 x 12 for 3 in wide PNG <- FALSE # Generate PNG (if TRUE, the PDF is not generated) PNGres <- 1000 # Resolution as a factor of PDFheight and PDFweight MARGINSNOTITLE <- c(5,4,1,1) # Only used when we set NOTITLE true NOTITLE <- TRUE # Sets main="" and changes margin par(mar=MARGINSNOTITLE) par(mar=c(5.1,4.1,4.1,2.1)) # default margins: par()$mar CLOSEPDF <- TRUE SET_YLIM <- TRUE # Set a fixed height (y-axis) of the plots # NOT USED YLIM <- 0.4 #1 # 0.5 #1 #0.8 # Height (y-axis) (originally if SET_YLIM is TRUE but now it is always considered) DRAW_CLASSIFIER <- FALSE # densities DRAW_ROC_CURVE <- TRUE TYPE_ROC_CURVE <- "o" # "o" line and circle/point/cross, "l" only lines, "o" only the circle point PCH_ROC_CURVE <- 4 # NOT APPLICABLE. # NA_integer_ # 15 # NA_integer_: nothing, 1: empty circle, 2: empty triangle, 15: solid square COLOUR_ROC_CURVE <- "black" # "orange" LWD_ROC_CURVE <- 2 # 1: thin, 2:thick LTY_ROC_CURVE <- 1 # lty=1 (continuous), lty=2 (dashed) # PCH_ROC_CURVE <- 0 # 0: none. LWD_ROC_HULL <- 2 # 1: thin, 2:thick LTY_ROC_HULL <- 2 # lty=1 (continuous), lty=2 (dashed) PCH_ROC_HULL <- 15 # 1: empty circle, 2: empty triangle, 15: solid square # IT DOES NOT WORK COLOR_ROC_DIAGONAL <- "grey" LWD_ROC_DIAGONAL <- 1 # 1: thin, 2:thick # Names of x-axis and y-axis COSTRATIO <- "cost" SKEW <- "skew" LOSS <- "loss" # Names for titles EXPECTEDLOSS <- "loss" # "minimum loss" # "expected loss" # "loss" # minimum expected loss BYCOST <- "by cost" BYSKEW <- "by skew" BETA22 <- "(Beta(2,2))" DRAW_OPTIMAL_CURVES <- TRUE # cost curves LTY_OPTIMAL_CURVES <- 2 # lty=1 (continuous), lty=2 (dashed) PCH_OPTIMAL_CURVES <- 15 # NA_integer_ # 15 # NA_integer_: nothing, 1: empty circle, 2: empty triangle, 15: solid square LWD_OPTIMAL_CURVES <- 2 # 1: thin, 2:thick COLOUR_OPTIMAL_CURVES <- "black" # "darkgreen" # "brown" # "black" # DRAW_OPTIMAL_CURVES_AGAIN <- TRUE # cost curves (second time and stronger) COLOUR_OPTIMAL_CURVE_AGAIN <- "black" # "darkgreen" # "brown" # "black" # LWD_OPTIMAL_CURVE_AGAIN <- 2 # 1: thin, 2:thick DRAW_ORANGE <- FALSE # ORANGE uniform line. DRAW_BETA_CURVES <- FALSE # RESOLUTION_EMPIRICAL_CURVES <- 300 #1000 #300 NATTHRESFUNZFACTOR <- FALSE NATTHRESFUNZACCUMULATE <- TRUE # if FALSE & FALSE, it is very asymmetric. Wrong # if TRUE & FALSE, it matches the way old Brier curve were made but gives wrong area # if FALSE & TRUE, it gives correct results (1/12, 7/12) COSTLINESREP <- TRUE # TRUE: Does not eliminate repeated lines. Calculates the red line correctly. # FALSE: Reduces the number of lines (eliminates repeated ones), but it computes the red line in a wrong way if there are ties. DRAW_COST_LINES <- FALSE # Cost lines DRAW_COST_LINES_AGAIN <- TRUE # Cost lines COLOUR_COST_LINES <- "grey" # "blue" # "darkblue" # "darkmagenta" DRAW_LOSSLINE <- FALSE # Average of cost lines (LOSS LINE, red line) COLOUR_LOSSLINE <- "red" LWD_LOSSLINE <- 2 # 1: thin, 2:thick LTY_LOSSLINE <- 1 # lty=1 (continuous), lty=2 (dashed) DRAW_COSTSQUARE <- TRUE # The square (grey) BRIERCURVES_WITH_FACTOR <- FALSE # TRUE: The old (wrong) way for skews. FALSE: the right way. DRAW_PROB_CURVES <- TRUE # Brier curves LWD_PROB_CURVES <- 2 # 1: thin, 2:thick PCH_PROB_CURVES <- 1 # 3 +, 4 x, NA_integer_ # 15 # NA_integer_: nothing, 1: empty circle, 2: empty triangle, 15: solid square TYPE_PROB_CURVES <- "o" # "o" line and circle, "l" only lines, "o" only the circle point DRAW_PROB_CURVES_AGAIN <- FALSE # Brier curves (again) COLOUR_PROB_CURVE_AGAIN <- "brown" # "darkmagenta" # "black" # "darkblue" # "darkmagenta" LWD_PROB_CURVE_AGAIN <- 2 # 1: thin, 2:thick DRAW_PROB_CURVES_PERCLASS <- TRUE # Draw prob (Brier) curves (positive and negative) separately. PLOT_SPACE <- TRUE # Must be true except when putting several curves together on the same graph PLOT_ROCSPACE <- TRUE # Same for ROC space OVERLAP_PROB_OPTIMAL <- TRUE # Overlap Brier curves and cost curves on the same plot. If no Brier curves, this should be ENABLED. OVERLAP_NONOPTIMAL <- TRUE # Overlap previous plot with cost lines DRAW_COSTRATIO_PLOTS <- TRUE DRAW_SKEW_PLOTS <- FALSE ALPHA_PLOT <- 2 # 0.01 (logloss?) BETA_PLOT <- 2 # 0.01 (logloss?) ############# END OUTPUT OPTIONS ############## ############################################### ############################################### ############################################### ############# OPERATING OPTIONS ############### ############################################### bfactor <- 2 # 1 is like Hand, and Drummond and Holte, but 2 makes more sense, since it places the maximum expected loss at 1 (and not 0.5 for a balanced dataset). ############# OPERATING OPTIONS ############### ############################################### ############################################### ############################################### ############## INPUT SAMPLES ################## ############################################### ########## INPUT FORMAT # data is in a matrix called ‘inp’ with two columns # column 1: classes, labelled 0 or 1 # column 2: classifier scores (preferrable between 0 and 1 if we want to draw Brier curves) inp <- cbind(c(1,1,0,0,0), c(0.9,0.7,0.7,0.2,0.1)) ############# END INPUT SAMPLES ############### ############################################### ############################################### ############################################### ################### INPUT ##################### ############################################### inp <- cbind(c(1,1,0,0,0,1,0,0,0,0,0,0,0,1,0),c(0.95,0.9,0.9,0.85,0.7,0.7,0.7, 0.55,0.45,0.2, 0.2, 0.18, 0.16 ,0.15,0.05)) #perfect calibration 5 (7/4) #inp <- inp74 #inp <- inpsmall ################ END INPUT #################### ############################################### ############################################### ############################################### ################ FUNCTIONS #################### ############################################### ############ Prob ThresFun(GENERIC) ProbThresFun <- function(inp, s) { t <- s t } ############ Prob ThresFunC(for costs) ProbThresFunC <- function(inp, c) { n0n1 <- nrow(inp) pi0 <- sum(inp[,1]==0) / n0n1 pi1 <- sum(inp[,1]==1) / n0n1 factorzc <- pi0 / ((1-c)*(1-pi0)+ c * pi0) t <- ProbThresFun(inp, c) # original # t <- ProbThresFun(inp, c) * factorzc # radical t } ############ Prob ThresFunZ(for skews) ProbThresFunZ <- function(inp, z) { n0n1 <- nrow(inp) pi0 <- sum(inp[,1]==0) / n0n1 pi1 <- sum(inp[,1]==1) / n0n1 t <- z # Theoretical, as in the papers. And also radical factor <- pi1/ ((1-z)*pi0 + (z)*(pi1)) #t <- ProbThresFun(inp, z*factor) # t <- ProbThresFun(inp, z) * factor # equivalent to above. The good ones for the old curves. # t <- ProbThresFun(inp, z) / factor # factorzc <- pi0 / ((1-z)*(1-pi0)+ z * pi0) # t <- ProbThresFun(inp, z) * factorzc # t <- ProbThresFun(inp, z) / factorzc t } ############ Calculate Loss (for skews) given a classifier and a threshold LossZ <- function(inp, t, z) { lo0 <- 0 lo1 <- 0 n0n1 <- nrow(inp) pi0 <- sum(inp[,1]==0) / n0n1 pi1 <- sum(inp[,1]==1) / n0n1 for (i in 1:n0n1) { if (inp[i,2] < t) { # if (inp[i,2] <= t) { pred <- 0 } else pred <- 1 if (pred != inp[i,1]) { if (pred == 1) { l0 <- z # 1 for 0-1 loss lo0 <- lo0 + l0 } else { l1 <- (1 - z) # 1 for 0-1 loss lo1 <- lo1 + l1 } } } lo0 <- bfactor * 0.5 *lo0 / pi0 lo1 <- bfactor * 0.5 *lo1 / pi1 (lo0 + lo1 ) / n0n1 } ############ Calculate Loss (for costs) given a classifier and a threshold LossC <- function(inp, t, c) { lo0 <- 0 lo1 <- 0 n0n1 <- nrow(inp) pi0 <- sum(inp[,1]==0) / n0n1 pi1 <- sum(inp[,1]==1) / n0n1 for (i in 1:n0n1) { if (inp[i,2] < t) { # if (inp[i,2] <= t) { pred <- 0 } else pred <- 1 if (pred != inp[i,1]) { if (pred == 1) { l0 <- c # 1 for 0-1 loss lo0 <- lo0 + l0 } else { l1 <- (1 - c) # 1 for 0-1 loss lo1 <- lo1 + l1 } } } lo0 <- bfactor * lo0 lo1 <- bfactor * lo1 (lo0 + lo1 ) / n0n1 } ############ Calculate ExpLoss (for skews) ExpLossZ <- function(inp, ThresFun) { ELoss <- 0 resS <- RESOLUTION_EMPIRICAL_CURVES range <- (0:resS) / resS for (z in range) { t <- ThresFun(inp, z) ELoss <- ELoss + LossZ(inp, t, z) } ELoss / (resS + 1) } # # Example of use: # # ExpLossZ(inp,ProbThresFunZ) # ExpLossZ(inp,NatThresFunZ) ############ Calculate ExpLoss (for costs) ExpLossC <- function(inp, ThresFun) { ELoss <- 0 resS <- RESOLUTION_EMPIRICAL_CURVES range <- (0:resS) / resS for (c in range) { t <- ThresFun(inp, c) ELoss <- ELoss + LossC(inp, t, c) } ELoss / (resS + 1) } # # Example of use: # # ExpLossC(inp,ProbThresFunC) # ExpLossC(inp,NatThresFunC) ############ Calculate an Empirical Loss Curve (for skews) LossCurveZ <- function(inp, ThresFun) { resS <- RESOLUTION_EMPIRICAL_CURVES range <- (0:resS) / resS ELoss <- 1:(resS+1) for (z in range) { t <- ThresFun(inp, z) ELoss[round(z*resS+1)] <- LossZ(inp, t, z) } ELoss } # # Example of use: # # LossCurveZ(inp,ProbThresFunZ) # LossCurveZ(inp,NatThresFunZ) ############ Calculate an Empirical Loss Curve (for costs) LossCurveC <- function(inp, ThresFun) { resS <- RESOLUTION_EMPIRICAL_CURVES range <- (0:resS) / resS ELoss <- 1:(resS+1) for (c in range) { t <- ThresFun(inp, c) ELoss[round(c*resS+1)] <- LossC(inp, t, c) } ELoss } # # Example of use: # # LossCurveC(inp,ProbThresFunC) # LossCurveC(inp,NatThresFunC) ############ Calculate MSE (Brier score) MSE <- function(inp) { MSE <- 0 for (i in 1:nrow(inp)) { MSE = MSE + (inp[i,1] - inp[i,2])^2 } MSE / nrow(inp) } ############ Calculate wMSE which depends on the skew wMSE <- function(inp, pi0, pi1) { MSE0 <- 0 MSE1 <- 0 for (i in 1:nrow(inp)) { p <- inp[i,2] pp <- p # pp <- p*(pi0) /((p*(pi0)+(1-p)*(pi1))) # pp <- p*(pi1) /((p*(pi1)+(1-p)*(pi0))) # pp <- p*(pi1)+(1-p)*(pi0) # pp <- p*(pi0)+(1-p)*(pi1) # pp <- p*(pi0) /((p*(pi1)+(1-p)*(pi0))) # pp <- p*(pi1) /((p*(pi0)+(1-p)*(pi1))) if (inp[i,1] == 1) { MSE1 = MSE1 + (inp[i,1] - pp)^2 } else { MSE0 = MSE0 + (inp[i,1] - pp)^2 } } MSE0 <- MSE0 / (nrow(inp)*pi0) MSE1 <- MSE1 / (nrow(inp)*pi1) # p <- MSE0 # MSE0 <- p*(pi0) /((p*(pi0)+(1-p)*(pi1))) # MSE0 <- p*(pi1) /((p*(pi1)+(1-p)*(pi0))) # p <- MSE1 # MSE1 <- p*(pi1) /((p*(pi1)+(1-p)*(pi0))) # MSE1 <- p*(pi0) /((p*(pi0)+(1-p)*(pi1))) wMSE <- 0.5 * (MSE0 + MSE1) # wMSE <- 2*(MSE0*0.5 + MSE1*0.5) / nrow(inp) # wMSE <- 2*(MSE0*pi1 + MSE1*pi0) / nrow(inp) # wMSE <- 2*(MSE0*pi0 + MSE1*pi1) / nrow(inp) # wMSE <- 2*(MSE0*pi1/(pi0*nrow(inp)) + MSE1*pi0/(pi1*nrow(inp))) # p <- wMSE # wMSE <- p*(pi1) /((p*(pi1)+(1-p)*(pi0))) # wMSE <- p*(pi0) /((p*(pi0)+(1-p)*(pi1))) wMSE # AUCCprobnewnorm*2 } ############# END FUNCTIONS ################### ############################################### ############################################### ############################################### #################### MAIN ##################### ############################################### ############ CONFIGURING OUTPUT ############### if (PDF) pdf("classifier.pdf", height= PDFheight, width= PDFwidth) if (PNG) png("classifier.png", height= PDFheight*PNGres, width= PDFwidth*PNGres) if (PLOTS_PER_PAGE == 6) par(mfrow=c(3,2)) if (PLOTS_PER_PAGE == 4) { if (PLOTS_PER_PAGE_VERTICAL) { par(mfrow=c(4,1)) } else par(mfrow=c(2,2)) } if (PLOTS_PER_PAGE == 3) { if (PLOTS_PER_PAGE_VERTICAL) { par(mfrow=c(3,1)) } else par(mfrow=c(1,3)) } if (PLOTS_PER_PAGE == 2) { if (PLOTS_PER_PAGE_VERTICAL) { par(mfrow=c(2,1)) } else par(mfrow=c(1,2)) } if (PLOTS_PER_PAGE == 1) par(mfrow=c(1,1)) ###############################################3 ############################################### #### SOME GENERAL VARIABLES USED ELSEWHERE #### n0n1 <- nrow(inp) x <- t(inp) zord <- order(x[2,]) sc <- x[,zord] n1 <- sum(sc[1,]) n0 <- n0n1 - n1 pi0 <- n0/n0n1 pi1 <- n1/n0n1 # We modify inp to be ordered by scores (this is needed for several plots) # order data into increasing scores zord <- order(x[2,]) zordrev <- rev(zord) screv <- x[,zordrev] inp <- t(screv) #Decreasing order if (n0 == 0) print("No elements of class 0") if (n1 == 0) print("No elements of class 0") # We calculate means class0 <- x[,x[1,]==0] class1 <- x[,x[1,]==1] if (n0 > 1) { s0 <- mean(class0[2,]) } else s0 <- class0[2] if (n1 > 1) { s1 <- mean(class1[2,]) } else s1 <- class1[2] calpi1 <- pi0*s0 + pi1*s1 # alpha and betad are the parameters in the beta # cost distribution ~ c^alpha * (1-c)^betad alpha <- 2 betad <- 2 ############################################### ############################################### ############## PLOTS CLASSIFIER ############### # Shows a classifier (densities). # Assumes scores are reversely ordered and no ties xax <- rev(inp[,2]) # reverses yax <- rev(inp[,1]) # reverses if (DRAW_CLASSIFIER) { if ((xax <= 1) && (xax >= 0)) { tit <- "Classifier" if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(xax,yax, xlim=c(0,1), main= tit,xlab= "Scores",ylab= "Actual Class") } if (!((xax <= 1) && (xax >= 0))) { tit <- "Classifier" if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } # plot(xax,yax, main= tit,xlab= "Scores",ylab= "Actual Class") } } ############################################### ############################################### ######### PLOTS SMOOTHED HISTOGRAM ############ # Smoothed histograms xmin <- min(x[2,]) xmax <- max(x[2,]) #plot(density(class0[2,]),xlim=c(xmin,xmax),main= "Kernel smoothed score distributions ",xlab= "Score ") #lines(density(class1[2,]),lty=4) if (DRAW_CLASSIFIER) { # Smoothed normalised histograms (to be plot over the classifier's plot) if (n0<=1) print("Density graph cannot be plot with n0 <= 1") den0 <- density(class0[2,]) # it doesn't work if n0 <= 1 if (n0<=1) print("Density graph cannot be plot with n1 <= 1") den1 <- density(class1[2,]) # it doesn't work if n1 <= 1 ymax <- max(max(den0[[2]]),max(den1[[2]])) den0normy <- den0[[2]]/ymax den1normy <- den1[[2]]/ymax lines(den0[[1]], den0normy, lty=3, col= "darkred") lines(den1[[1]], 1-den1normy, lty=4, col= "green") } ############################################### ############################################### ##### ACCUMULATED EMPIRICAL DIST. F0, F1 ###### # Calculate the raw ROC, replacing any tied # sequences by a ‘diagonal’ in the ROC curve. # The raw ROC starts at F0[1]=0, F1[1]=0, and ends at # F0[K1]=n0, F1[K1]=n1. # F0 and F1 are counts, G0 and G1 are normalised in (0,1) # F0 and F1 eliminate merge repeated values. # F0rep and F1rep keep the repeated values as different elements in the vector. # K1 is the number of non-repeated scores. It will be used everywhere sc <- cbind(sc,sc[,n0n1]) # Adds the last example. Now sc is one element longer. F0 <- c(0:n0n1) F1 <- c(0:n0n1) K1 <- 1 k <- 2 for (i in 1:n0n1) { F0[k] <- F0[K1]+(1-sc[1,i]) F1[k] <- F1[K1]+sc[1,i] K1 <- k k <- if (sc[2,i+1] == sc[2,i]) (k) else (k+1) } F0 <- F0[1:K1] F1 <- F1[1:K1] G0nomin <- F0 / n0 G1nomin <- F1 / n1 scshared <- sc[1,] j<- 1 Ac <- sc[1,1] for (i in 2:n0n1) { if (sc[2,i] == sc[2,i-1]) { j <- j +1 Ac <- Ac + sc[1,i] } else { for (k in (i-1):(i-j)) { scshared[k] <- Ac / j } j <- 1 Ac <- sc[1,i] } } F0rep <- c(0:n0n1) F1rep <- c(0:n0n1) K1rep <- 1 krep <- 2 for (i in 1:n0n1) { F0rep[krep] <- F0rep[K1rep]+(1-scshared[i]) F1rep[krep] <- F1rep[K1rep]+scshared[i] K1rep <- krep krep <- krep+1 } F0rep <- F0rep[1:K1rep] F1rep <- F1rep[1:K1rep] G0nominrep <- F0rep / n0 G1nominrep <- F1rep / n1 ############################################### ############################################### ######## CONVEX HULL - MINIMUM COSTS ########## # Find the upper concave hull # G0 and G1 are normalised in (0,1) and optimal # Repeated values are merged # hc will be the number of segments in the convex hull G0 <- c(0:(K1-1)) G1 <- c(0:(K1-1)) i <- 1 hc <- 1 while (i < K1) { c1 <- c((i+1):K1) for (j in (i+1):K1) { u1 <- (F1[j]-F1[i]) u0 <- (F0[j]-F0[i]) c1[j] <- u1/(u1+u0) } argmin <- i+1 c1min <- c1[i+1] for (k in (i+1):K1) { argmin <- if (c1[k] <= c1min) (k) else (argmin) c1min <- c1[argmin] } hc <- hc+1 G0[hc] <- F0[argmin] G1[hc] <- F1[argmin] i <- argmin } G0 <- G0[1:hc]/n0 G1 <- G1[1:hc]/n1 ############################################### ############################################### ######## PLOTS ROC CURVE AND HULL ############# if (DRAW_ROC_CURVE) { # Plot the ROC tit <- "ROC curve and convex hull" if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } if (PLOT_ROCSPACE) { plot(c(0,1),c(0,1), xlab= "F1 ",ylab= "F0 ",main= tit, type= "l" ) } lines(c(0,1),c(0,1),type= "l", col=COLOR_ROC_DIAGONAL, lwd= LWD_ROC_DIAGONAL) # diagonal lines(F1/n1,F0/n0, type= TYPE_ROC_CURVE, lwd=LWD_ROC_CURVE, lty= LTY_ROC_CURVE, pch= PCH_ROC_CURVE, col = COLOUR_ROC_CURVE) # Draw hull lines(G1,G0,type= "l",lty=LTY_ROC_HULL, lwd=LWD_ROC_HULL, pch= PCH_ROC_HULL, col <- COLOUR_ROC_CURVE) } # Plot the AUC mixture weight function in terms of the score # plot(density(x[2,]),lty=1,xlab= "Score ",main= " AUC measure weight function of T", ylab= "W(t)") # Plot the AUC mixture weight function in terms of the cost # aucd <- c((n0*G0 + n1*G1),1) # aucd2 <- c(1, (n0*G0 + n1*G1)) # aucf <- (aucd-aucd2)/n0n1 # plot(cost[2:hc],aucf[2:hc],type= "h", xlim=c(0,1), ylim=c(0,1),main= "AUC measure weight function of c",xlab= "Cost",ylab= "w(c)") # Plot the BETA weight function in terms of the cost # b <- c(1:100)/100 # y <- dbeta(b,alpha,betad) # plot(b,y,type= "l",xlab= COSTRATIO, main= "H measure weight function of c",ylab= "w(c) ") ############################################### ############################################### ########## AUC, AUCH, KS statistic ############ # Calculate the area under the ROC curve, AUC K11 <- K1+1 F0[K11] <- n0 F1[K11] <- n1 F0 <- F0[1:K11] F1 <- F1[1:K11] F0A <- F0[2:K11] F0B <- F0[1:K1] F1A <- F1[2:K11] F1B <- F1[1:K1] AUC <- sum((F0A-F0B)*(n1-(F1A+F1B)/2))/(n0*n1) Gini <- 2*AUC - 1 # Calculate the area under the CONVEX HULL, AUCH AUCH <- 0 for (i in 1:(hc-1)) { AUCH <- AUCH + G0[i]*(G1[i+1]-G1[i]) + 0.5*(G0[i+1]-G0[i])*(G1[i+1]-G1[i]) } # Compute KS statistic KS <- max((F0/n0) - (F1/n1)) ############################################### ############################################### ##### Calculate cost ratio array (x-axis) ##### # Legend # x-axis is for minimal costs, cost ratios and using uniform distribution unless specified # XXXXXnorm: skews # XXXXXnomin: no minimal costs (ROC curve) # XXXXXBeta22: using the Beta(2,2) distribution # XXXXXProb: Brier thresholds # x-axis using the uniform of the ROC curve for cost ratios costnomin <- c(1:(K1+1)) costnomin[1] <- 0 costnomin[K1+1] <- 1 for (i in 2:K1) { costnomin[i] <- 1 * pi1*(G1nomin[i]-G1nomin[i-1]) / (pi0*(G0nomin[i]-G0nomin[i-1]) + pi1*(G1nomin[i]-G1nomin[i-1])) } # x-axis using the uniform of the convex hull for cost ratios cost <- c(1:(hc+1)) cost[1] <- 0 cost[hc+1] <- 1 for (i in 2:hc) { cost[i] <- 1 * pi1*(G1[i]-G1[i-1]) / (pi0*(G0[i]-G0[i-1]) + pi1*(G1[i]-G1[i-1])) } # x-axis using the beta(2,2) of the convex hull for cost ratios costBeta22 <- pbeta(cost, 2, 2) # x-axis using the uniform of the convex hull for skews costnorm <- c(1:(hc+1)) costnorm[1] <- 0 costnorm[hc+1] <- 1 for (i in 2:hc) { costnorm[i] <- 1 * (G1[i]-G1[i-1]) / ((G0[i]-G0[i-1]) + (G1[i]-G1[i-1])) } # x-axis using the beta(2,2) of the convex hull for skews costnormBeta22 <- pbeta(costnorm, 2, 2) inpnorep <- 1:n0n1 # removes repeated values from inp[,2] j <- 1 olda <- -1 for (i in 1:n0n1) { a <- inp[i,2] if ((a != olda) || (i == 1)) { inpnorep[j] <- a olda <- a j <- j+1 } } # j-1 should be equal to K1 here inpnorep <- inpnorep[1:(K1-1)] # x-axis using the uniform of the Brier thresholds for cost ratios costprobrep <- c(1:(K1rep+1)) costprobrep[1] <- 0 costprobrep[K1rep+1] <- 1 for (i in 2:K1rep) { costprobrep[i] <- 1 * inp[K1rep-i+1,2] } costprobnorep <- c(1:(K1+1)) costprobnorep[1] <- 0 costprobnorep[K1+1] <- 1 for (i in 2:K1) { costprobnorep[i] <- 1 * inpnorep[K1-i+1] # costprobnorep[i] <- 1-inpnorep[i-1] } # REGULAR STEPS! costregular <- c(1:(K1rep+1)) costregular[1] <- 0 costregular[K1rep+1] <- 1 for (i in 2:K1rep) { costregular[i] <- 1 * (i-1)/K1rep } ############################################### ############################################### ######## Expected cost Qprobnew (Brier) ####### ############## for cost ratio ################# # This does not work. Skip until "NEW APPROACH" # costhere <- costprobnorep # K1here <- K1 # # Qprob <- c(1:(K1here+1)) # for (i in 1:K1here) # { # prova <- costhere[i] # prova0 <- G0nomin[i] # prova1 <- G1nomin[i] # Qprob[i] <- bfactor * (prova*pi0*(1-prova0) + (1-prova)*pi1*prova1) # } # Qprob[(K1here+1)] <- 0 # # # costhere2 <- costprobnorep # if (DRAW_PROB_CURVES) # plot(costhere2,Qprob, type= "o", main= "Expected cost by cost ratio ",xlab= COSTRATIO,ylab= LOSS, col="magenta") # # AUCCprob # # AUCCprob <- 0 # for (i in 2:(K1here+1)) # { # AUCCprob <- AUCCprob + (costhere2[i]-costhere2[i-1]) * (Qprob[i]+Qprob[i-1])/2 # } # NEW APPROACH K1new <- K1*2 costprobnew <- c(1:K1new) Qprobnew <- c(1:K1new) Qprobnew0 <- c(1:K1new) Qprobnew1 <- c(1:K1new) for (i in 2:(K1new-1)) { costprobnew[i] <- costprobnorep[trunc(i/2)+1] prova <- costprobnew[i] prova0 <- G0nomin[i] prova1 <- G1nomin[i] Qprobnew[i] <- bfactor * (prova*pi0*(1-G0nomin[trunc((i+1)/2)]) + (1-prova)*pi1*G1nomin[trunc((i+1)/2)]) Qprobnew0[i] <- bfactor * (prova*pi0*(1-G0nomin[trunc((i+1)/2)])) Qprobnew1[i] <- bfactor * (1-prova)*pi1*G1nomin[trunc((i+1)/2)] } Qprobnew[1] <- 0 Qprobnew[K1new] <- 0 Qprobnew0[1] <- 0 Qprobnew0[K1new] <- 0 Qprobnew1[1] <- 0 Qprobnew1[K1new] <- 0 costprobnew[1] <- 0 costprobnew[K1new] <- 1 # This postprocess is to eliminate segments for one class which are generated by the other, so each class Brier curve will only show its point (the shape of the curve is the same) costprobnew0ok <- c(1:K1new) Qprobnew0ok <- c(1:K1new) K1new0 <- 2 costprobnew1ok <- c(1:K1new) Qprobnew1ok <- c(1:K1new) K1new1 <- 2 for (i in 2:(K1new-1)) { if (G0nomin[trunc((i)/2)] != G0nomin[trunc((i+2)/2)]) { costprobnew0ok[K1new0] <- costprobnew[i] Qprobnew0ok[K1new0] <- Qprobnew0[i] K1new0 <- K1new0 + 1 } if (G1nomin[trunc((i)/2)] != G1nomin[trunc((i+2)/2)]) { costprobnew1ok[K1new1] <- costprobnew[i] Qprobnew1ok[K1new1] <- Qprobnew1[i] K1new1 <- K1new1 + 1 } } Qprobnew0ok[1] <- 0 Qprobnew0ok[K1new0] <- 0 Qprobnew0ok <- Qprobnew0ok[1:K1new0] costprobnew0ok[1] <- 0 costprobnew0ok[K1new0] <- 1 costprobnew0ok <- costprobnew0ok[1:K1new0] Qprobnew1ok[1] <- 0 Qprobnew1ok[K1new1] <- 0 Qprobnew1ok <- Qprobnew1ok[1:K1new1] costprobnew1ok[1] <- 0 costprobnew1ok[K1new1] <- 1 costprobnew1ok <- costprobnew1ok[1:K1new1] if (DRAW_COSTRATIO_PLOTS) { # only plots the space if (PLOT_SPACE) { tit <- paste(EXPECTEDLOSS,BYCOST) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(c(0,1),c(0,0), ylim=c(0,YLIM), main= tit,xlab= COSTRATIO,ylab= LOSS, col="white") } if (DRAW_PROB_CURVES) { # if (!SET_YLIM) # plot(costprobnew,Qprobnew, type= "o", main= "Expected cost by cost ratio ",xlab= COSTRATIO,ylab= "Expected cost ", col="magenta") # else # plot(costprobnew,Qprobnew, ylim=c(0,YLIM), type= "o", main= "Expected cost by cost ratio ",xlab= COSTRATIO,ylab= "Expected cost ", col="magenta") lines(costprobnew,Qprobnew, type= TYPE_PROB_CURVES, lwd= LWD_PROB_CURVES, col="magenta", pch= PCH_PROB_CURVES) } if (DRAW_PROB_CURVES_PERCLASS) { # These draw more points than necessary # lines(costprobnew,Qprobnew0, type= TYPE_PROB_CURVES, lwd= LWD_PROB_CURVES, col="orange", pch= 24) # lines(costprobnew,Qprobnew1, type= TYPE_PROB_CURVES, lwd= LWD_PROB_CURVES, col="green", pch= 25) # These only plot the necessary points lines(costprobnew0ok,Qprobnew0ok, type= TYPE_PROB_CURVES, lwd= LWD_PROB_CURVES, col="red", pch= 24) lines(costprobnew1ok,Qprobnew1ok, type= TYPE_PROB_CURVES, lwd= LWD_PROB_CURVES, col="blue", pch= 25) } } # AUCCprobnew AUCCprobnew <- 0 for (i in 2:K1new) { AUCCprobnew <- AUCCprobnew + (costprobnew[i]-costprobnew[i-1]) * (Qprobnew[i]+Qprobnew[i-1])/2 } ############################################### ############################################### ###### Expected cost Q (minimal) ############## ############## for cost ratio ################# # CALCULATE THE MINIMUM LOSS VS c CURVE (UNNORMALISED, HAND) Q <- c(1:(hc+1)) for (i in 1:hc) { Q[i] <- bfactor * (cost[i]*pi0*(1-G0[i]) + (1-cost[i])*pi1*G1[i]) } Q[(hc+1)] <- 0 if (DRAW_COSTRATIO_PLOTS) { if (DRAW_OPTIMAL_CURVES) { if (OVERLAP_PROB_OPTIMAL) { points(cost,Q, type= "o", lwd=LWD_OPTIMAL_CURVES, pch = PCH_OPTIMAL_CURVES, col=COLOUR_OPTIMAL_CURVES, lty= LTY_OPTIMAL_CURVES) } else { tit <- paste(EXPECTEDLOSS, BYCOST) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(cost,Q, type= "l", lwd=LWD_OPTIMAL_CURVES, pch = PCH_OPTIMAL_CURVES, main= tit ,xlab= COSTRATIO,ylab= LOSS, lty= LTY_OPTIMAL_CURVES, col=COLOUR_OPTIMAL_CURVES, ylim=c(0,YLIM)) } } } # AUCC AUCC <- 0 for (i in 2:(hc+1)) { AUCC <- AUCC + (cost[i]-cost[i-1]) * (Q[i]+Q[i-1])/2 } # RASTERISED VERSION (NOT USED) # rasterise the curve (cost,Q) resorast <- 1000 Qrast <- 1:resorast for (i in 1:hc) { ini <- trunc(cost[i]*resorast) fin <- trunc(cost[i+1]*resorast) len <- fin-ini if (len > 0) { for (j in ini:fin) { Qrast[j] <- ((j-ini)/len) * (Q[i+1] - Q[i]) + Q[i] # bfactor already included in Q. } } } costrast <- (0:(resorast-1))/resorast resovis <- 5 vecvis <- (0:resovis)/resovis veclabvis <- vecvis # plot(costrast,Qrast, type= "l", main= "Minimum loss by cost ratio (rasterised)",xlab= COSTRATIO,ylab= "Minimum achievable loss ", axes=FALSE) # axis(1, at=vecvis, lab=veclabvis) # axis(2) # box() ############################################### ############################################### ###### Expected cost Q (minimal) ############## ############## for cost ratio ################# ################ BETA(2,2) #################### # CALCULATE THE MINIMUM LOSS VS c CURVE (UNNORMALISED, HAND, Beta(2,2)) costrastBeta22 <- pbeta(costrast, ALPHA_PLOT, BETA_PLOT) if (DRAW_COSTRATIO_PLOTS) { if ((DRAW_OPTIMAL_CURVES) && (DRAW_BETA_CURVES)) { tit <- paste(EXPECTEDLOSS, BYCOST) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(costrastBeta22,xlim=c(0,1),ylim=c(0,YLIM),Qrast, type= "l", main= tit,xlab= paste(COSTRATIO, BETA22),ylab= LOSS, lwd=LWD_OPTIMAL_CURVES, pch= PCH_OPTIMAL_CURVES, axes=FALSE, lty= LTY_OPTIMAL_CURVES, col=COLOUR_OPTIMAL_CURVES) } } reso <- 1000 vec <- (0:reso)/reso veclab <- pbeta(vec, ALPHA_PLOT, BETA_PLOT) resovis <- 5 vecvis <- (0:resovis)/resovis veclabvis <- (0:resovis)/resovis eps <- 5/ reso for (i in 1:resovis) { for (j in 1:reso) { if (abs(veclab[j] - vecvis[i]) < eps) { veclabvis[i] <- (trunc((j/reso)*100))/100 break } } } if (DRAW_COSTRATIO_PLOTS) { if ((DRAW_OPTIMAL_CURVES) && (DRAW_BETA_CURVES)) { axis(1, at=vecvis, lab=veclabvis) axis(2) box() } } AUCCBeta22 <- 0 for (i in 2:resorast) { AUCCBeta22 <- AUCCBeta22 + (costrastBeta22[i]-costrastBeta22[i-1]) * (Qrast[i]+Qrast[i-1])/2 } ############################################### ############################################### ###### Expected cost Qnomin (ROC) ############# ############## for cost ratio ################# # CALCULATE THE NON-MINIMUM LOSS VS c CURVE Qnomin <- c(1:(K1+1)) costnominreg <- costnomin costnominreg[2:(K1+1)] <- (0:(K1-1)) / (K1-1) costhere <- costnominreg for (i in 1:K1) { Qnomin[i] <- bfactor * (costhere[i]*pi0*(1-G0nomin[i]) + (1-costhere[i])*pi1*G1nomin[i]) } Qnomin[(K1+1)] <- 0 if (DRAW_COSTRATIO_PLOTS) { if (DRAW_NONOPTIMALROC_CURVES) { # plot(costhere,Qnomin,type= "l", main= "NON-Minimum loss by cost ratio ",xlab= COSTRATIO,ylab= "NON-Minimum achievable loss ", col="green") lines(costhere,Qnomin,type= "l", col="green") } } # AUCCnomin AUCCnomin <- 0 ampleAcc <- 0 for (i in 2:(K1+1)) { ample <- abs(costnomin[i]-costnomin[i-1]) ampleAcc <- ampleAcc + ample AUCCnomin <- AUCCnomin + ample * (Qnomin[i]+Qnomin[i-1])/2 } AUCCnomin <- AUCCnomin / ampleAcc ############################################### ############################################### ###### Expected cost Qnomintots (ALL) ######### ############## for cost ratio ################# if (!COSTLINESREP) { # Reduces the number of lines (eliminates repeated ones), but it computes the red line in a wrong way if there are ties. # AUCCnomintots xnomintots <- rep(0,K1*4) ynomintots <- rep(0,K1*4) Leftacc <- 0 Rightacc <- 0 Leftmax <- 0 Rightmax <- 0 AUCCnomintots <- 0 for (i in 1:K1) { Leftv <- G1nomin[i]*pi1 * bfactor Rightv <- (1 - G0nomin[i])*pi0 * bfactor w <- 1 # weight Leftacc <- Leftacc + w*Leftv Rightacc <- Rightacc + w*Rightv Leftmax <- max(Leftmax, Leftv) Rightmax <- max(Rightmax, Rightv) xnomintots[(i-1)*4+1] <- 0 xnomintots[(i-1)*4+2] <- 1 xnomintots[(i-1)*4+3] <- 1 xnomintots[(i-1)*4+4] <- 0 ynomintots[(i-1)*4+1] <- Leftv ynomintots[(i-1)*4+2] <- Rightv ynomintots[(i-1)*4+3] <- 0 ynomintots[(i-1)*4+4] <- 0 linia <- ( Rightv + Leftv) / 2 # print(linia) AUCCnomintots <- AUCCnomintots + linia # print(AUCCnomintots) } AUCCnomintots <- AUCCnomintots / K1 Leftacc <- Leftacc / K1 Rightacc <- Rightacc / K1 if (DRAW_COSTRATIO_PLOTS) { if (DRAW_LOSSLINE) { points(c(0,1),c(Leftacc,Rightacc), type= "l", lty= LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd= LWD_LOSSLINE) } if (DRAW_COST_LINES) { if (!(OVERLAP_NONOPTIMAL)) { tit <- paste(EXPECTEDLOSS, BYCOST) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(xnomintots,ynomintots, type= "l", main= tit,xlab= COSTRATIO,ylab= LOSS, lty=2, col = COLOUR_COST_LINES) } else { points(xnomintots,ynomintots, type= "l", lty=2, col = COLOUR_COST_LINES) } if (DRAW_LOSSLINE) points(c(0,1),c(Leftacc,Rightacc), type= "l", lty=LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd= LWD_LOSSLINE) if (DRAW_COSTSQUARE) { points(c(0,1),c(0,0), type= "l", col = "grey", lwd=1) # horizontal points(c(0,0),c(0,Leftmax), type= "l", col = "grey", lwd=1) # left vertical # points(c(0,1),c(0,0), type= "l", col = "grey", lwd=1) points(c(1,1),c(0,Rightmax), type= "l", col = "grey", lwd=1) # right vertical # points(c(1,1),c(0,1), type= "l", col = "grey", lwd=1) } } if (DRAW_OPTIMAL_CURVES_AGAIN) { points(cost,Q, type= "o", lwd=LWD_OPTIMAL_CURVE_AGAIN, col=COLOUR_OPTIMAL_CURVE_AGAIN, lty= LTY_OPTIMAL_CURVES, pch=PCH_OPTIMAL_CURVES) } if (DRAW_PROB_CURVES_AGAIN) { lines(costprobnew,Qprobnew, type= TYPE_PROB_CURVES, lwd=LWD_PROB_CURVE_AGAIN, col=COLOUR_PROB_CURVE_AGAIN, pch= PCH_PROB_CURVES) } } } ############################################### ############################################### ###### Expected cost Qnomintots (ALL) ######### ############## for cost ratio ################# if (COSTLINESREP) { # Preserves repeated lines. It computes the red line correctly with ties. # AUCCnomintots xnomintotsrep <- rep(0,K1rep*4) ynomintotsrep <- rep(0,K1rep*4) Leftaccrep <- 0 Rightaccrep <- 0 Leftmaxrep <- 0 Rightmaxrep <- 0 AUCCnomintotsrep <- 0 for (i in 1:K1rep) { Leftvrep <- G1nominrep[i]*pi1 * bfactor Rightvrep <- (1 - G0nominrep[i])*pi0 * bfactor w <- 1 # weight Leftaccrep <- Leftaccrep + w*Leftvrep Rightaccrep <- Rightaccrep + w*Rightvrep Leftmaxrep <- max(Leftmaxrep, Leftvrep) Rightmaxrep <- max(Rightmaxrep, Rightvrep) xnomintotsrep[(i-1)*4+1] <- 0 xnomintotsrep[(i-1)*4+2] <- 1 xnomintotsrep[(i-1)*4+3] <- 1 xnomintotsrep[(i-1)*4+4] <- 0 ynomintotsrep[(i-1)*4+1] <- Leftvrep ynomintotsrep[(i-1)*4+2] <- Rightvrep ynomintotsrep[(i-1)*4+3] <- 0 ynomintotsrep[(i-1)*4+4] <- 0 liniarep <- ( Rightvrep + Leftvrep) / 2 # print(linia) AUCCnomintotsrep <- AUCCnomintotsrep + liniarep # print(AUCCnomintots) } AUCCnomintotsrep <- AUCCnomintotsrep / K1rep Leftaccrep <- Leftaccrep / K1rep Rightaccrep <- Rightaccrep / K1rep if (DRAW_COSTRATIO_PLOTS) { if (DRAW_LOSSLINE) { points(c(0,1),c(Leftaccrep,Rightaccrep), type= "l", lty= LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd= LWD_LOSSLINE) } if (DRAW_COST_LINES_AGAIN) { if (!(OVERLAP_NONOPTIMAL)) { tit <- paste(EXPECTEDLOSS, BYCOST) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(xnomintotsrep,ynomintotsrep, type= "l", main= tit,xlab= COSTRATIO,ylab= LOSS, lty=2, col = COLOUR_COST_LINES) } else { points(xnomintotsrep,ynomintotsrep, type= "l", lty=2, col = COLOUR_COST_LINES) } if (DRAW_LOSSLINE) points(c(0,1),c(Leftaccrep,Rightaccrep), type= "l", lty=LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd= LWD_LOSSLINE) if (DRAW_COSTSQUARE) { points(c(0,1),c(0,0), type= "l", col = "grey", lwd=1) # horizontal points(c(0,0),c(0,Leftmaxrep), type= "l", col = "grey", lwd=1) # left vertical # points(c(0,1),c(0,0), type= "l", col = "grey", lwd=1) points(c(1,1),c(0,Rightmaxrep), type= "l", col = "grey", lwd=1) # right vertical # points(c(1,1),c(0,1), type= "l", col = "grey", lwd=1) } } if (DRAW_OPTIMAL_CURVES_AGAIN) { points(cost,Q, type= "o", lwd=LWD_OPTIMAL_CURVE_AGAIN, col=COLOUR_OPTIMAL_CURVE_AGAIN, lty= LTY_OPTIMAL_CURVES, pch=PCH_OPTIMAL_CURVES) } if (DRAW_PROB_CURVES_AGAIN) { lines(costprobnew,Qprobnew, type= TYPE_PROB_CURVES, lwd=LWD_PROB_CURVE_AGAIN, col=COLOUR_PROB_CURVE_AGAIN, pch= PCH_PROB_CURVES) } } } ############################################### ############################################### ##### Expected cost Qprobnewnorm (Brier) ###### ################ for SKEW ##################### K1new <- K1*2 costprobnewnorm <- c(1:K1new) Qprobnewnorm <- c(1:K1new) Qprobnewnorm0 <- c(1:K1new) Qprobnewnorm1 <- c(1:K1new) for (i in 2:(K1new-1)) { p <- costprobnorep[trunc(i/2)+1] if (BRIERCURVES_WITH_FACTOR) { costprobnewnorm[i] <- p*(pi0) /((p*(pi0)+(1-p)*(pi1))) # Old Brier Curves } else costprobnewnorm[i] <- p # Corrected Brier Curves prova <- costprobnewnorm[i] Qprobnewnorm[i] <- bfactor * 0.5 * (prova*(1-G0nomin[trunc((i+1)/2)]) + (1-prova)*G1nomin[trunc((i+1)/2)]) Qprobnewnorm0[i] <- bfactor * 0.5 * (prova*(1-G0nomin[trunc((i+1)/2)])) Qprobnewnorm1[i] <- bfactor * 0.5 * ((1-prova)*G1nomin[trunc((i+1)/2)]) } Qprobnewnorm[1] <- 0 Qprobnewnorm[K1new] <- 0 Qprobnewnorm0[1] <- 0 Qprobnewnorm0[K1new] <- 0 Qprobnewnorm1[1] <- 0 Qprobnewnorm1[K1new] <- 0 costprobnewnorm[1] <- 0 costprobnewnorm[K1new] <- 1 # This postprocess is to eliminate segments for one class which are generated by the other, so each class Brier curve will only show its point (the shape of the curve is the same) costprobnewnorm0ok <- c(1:K1new) Qprobnewnorm0ok <- c(1:K1new) K1new0 <- 2 costprobnewnorm1ok <- c(1:K1new) Qprobnewnorm1ok <- c(1:K1new) K1new1 <- 2 for (i in 2:(K1new-1)) { if (G0nomin[trunc((i)/2)] != G0nomin[trunc((i+2)/2)]) { costprobnewnorm0ok[K1new0] <- costprobnewnorm[i] Qprobnewnorm0ok[K1new0] <- Qprobnewnorm0[i] K1new0 <- K1new0 + 1 } if (G1nomin[trunc((i)/2)] != G1nomin[trunc((i+2)/2)]) { costprobnewnorm1ok[K1new1] <- costprobnewnorm[i] Qprobnewnorm1ok[K1new1] <- Qprobnewnorm1[i] K1new1 <- K1new1 + 1 } } Qprobnewnorm0ok[1] <- 0 Qprobnewnorm0ok[K1new0] <- 0 Qprobnewnorm0ok <- Qprobnewnorm0ok[1:K1new0] costprobnewnorm0ok[1] <- 0 costprobnewnorm0ok[K1new0] <- 1 costprobnewnorm0ok <- costprobnewnorm0ok[1:K1new0] Qprobnewnorm1ok[1] <- 0 Qprobnewnorm1ok[K1new1] <- 0 Qprobnewnorm1ok <- Qprobnewnorm1ok[1:K1new1] costprobnewnorm1ok[1] <- 0 costprobnewnorm1ok[K1new1] <- 1 costprobnewnorm1ok <- costprobnewnorm1ok[1:K1new1] if (DRAW_SKEW_PLOTS) { # only plots the space if (PLOT_SPACE) { tit <- paste(EXPECTEDLOSS, BYSKEW) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(c(0,1),c(0,0), ylim=c(0,YLIM), main= tit,xlab= SKEW,ylab= LOSS, col="white") } if (DRAW_PROB_CURVES) { # if (!SET_YLIM) # plot(costprobnewnorm,Qprobnewnorm, type= "o", main= "Expected cost by skew ",xlab= "skew ",ylab= "Expected cost ", col="magenta", lwd= LWD_PROB_CURVES, pch= PCH_PROB_CURVES) # else # plot(costprobnewnorm,Qprobnewnorm, ylim=c(0,YLIM), type= "o", main= "Expected cost by skew ",xlab= "skew ",ylab= "Expected cost ", col="magenta") lines(costprobnewnorm,Qprobnewnorm, ylim=c(0,YLIM), type= TYPE_PROB_CURVES, col="magenta",lwd= LWD_PROB_CURVES, pch= PCH_PROB_CURVES) } if (DRAW_PROB_CURVES_PERCLASS) { # These plot more points than necessary (although the curve is the same) # lines(costprobnewnorm,Qprobnewnorm0, type= TYPE_PROB_CURVES, col="orange", lwd= LWD_PROB_CURVES, pch= 24) # lines(costprobnewnorm,Qprobnewnorm1, type= TYPE_PROB_CURVES, col="green", lwd= LWD_PROB_CURVES, pch= 25) # These only plot the necessary points. lines(costprobnewnorm0ok,Qprobnewnorm0ok, type= TYPE_PROB_CURVES, col="red", lwd= LWD_PROB_CURVES, pch= 24) lines(costprobnewnorm1ok,Qprobnewnorm1ok, type= TYPE_PROB_CURVES, col="blue", lwd= LWD_PROB_CURVES, pch= 25) } } # AUCCprobnewnorm AUCCprobnewnorm <- 0 for (i in 2:K1new) { AUCCprobnewnorm <- AUCCprobnewnorm + (costprobnewnorm[i]-costprobnewnorm[i-1]) * (Qprobnewnorm[i]+Qprobnewnorm[i-1])/2 } ############################################### ############################################### #### Expected cost Qprobmaxnorm (uniform) ##### ################## for SKEW ################### # COST MAX (Peter's new derivation, uniform) NORMALISED # HERE, IT MUST BE DIFFERENT FROM cost ratios, SINCE WE CAN ASSUME A UNIFORM DISTRIBUTION ON SKEWS OR A UNIFORM DISTRIBUTION ON COSTS. # UNIFORM: THE EASY WAY: (UNIFORM COSTS) # This way is wrong. # Leftuninorm <- pi1*(1-s1) # Rightuninorm <- pi0*s0 # AUCCprobmaxnorm <- (Leftuninorm + Rightuninorm) / 2 # UNIFORM: THE EASY WAY: (UNIFORM SKEWS) Leftuninorm <- 0.5*(1-s1) Rightuninorm <- 0.5*s0 AUCCprobmaxnorm <- (Leftuninorm + Rightuninorm) / 2 if (DRAW_SKEW_PLOTS) { if (DRAW_ORANGE) { if (OVERLAP_PROB_OPTIMAL) { points(c(0,1),c(Leftuninorm,Rightuninorm), type= "o", pch=22, lty=2, col="orange") } else { tit <- "Uni loss by skew" if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(c(0,1),c(Leftuninorm,Rightuninorm), type= "o", pch=22, lty=2, main= tit,xlab= SKEW,ylab= "Uni loss ", col="orange") } } } ############################################### ############################################### ###### Expected cost Qnorm (minimal) ########## ################## for SKEW ################### # CALCULATE THE MINIMUM LOSS VS c CURVE (NORMALISED) Qnorm <- c(1:(hc+1)) for (i in 1:hc) { #old Qnorm[i] <- costnorm[i]*(1-G0[i]) + (1-costnorm[i])*G1[i] Qnorm[i] <- bfactor * 0.5 * (costnorm[i]*(1-G0[i]) + (1-costnorm[i])*G1[i]) } Qnorm[(hc+1)] <- 0 if (DRAW_SKEW_PLOTS) { if (DRAW_OPTIMAL_CURVES) { if (OVERLAP_PROB_OPTIMAL) { points(costnorm,Qnorm, lwd=LWD_OPTIMAL_CURVES ,pch= PCH_OPTIMAL_CURVES, type= "o", col=COLOUR_OPTIMAL_CURVES, lty= LTY_OPTIMAL_CURVES) } else { tit <- paste(EXPECTEDLOSS, BYSKEW) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(costnorm,Qnorm,type= "l", main= tit, xlab= "skew ",ylab= LOSS, lwd= LWD_OPTIMAL_CURVES, pch= PCH_OPTIMAL_CURVES, lty= LTY_OPTIMAL_CURVES, col=COLOUR_OPTIMAL_CURVES, ylim=c(0,YLIM)) } } } # AUCCnorm AUCCnorm <- 0 for (i in 2:(hc+1)) { AUCCnorm <- AUCCnorm + (cost[i]-cost[i-1]) * (Qnorm[i]+Qnorm[i-1])/2 } # RASTERISED VERSION (NOT USED) # rasterise the curve (costnorm,Qnorm) resorast <- 1000 Qnormrast <- 1:resorast for (i in 1:hc) { ini <- trunc(costnorm[i]*resorast) fin <- trunc(costnorm[i+1]*resorast) len <- fin-ini if (len > 0) { for (j in ini:fin) { Qnormrast[j] <- ((j-ini)/len) * (Qnorm[i+1] - Qnorm[i]) + Qnorm[i] # bfactor already included in Qnorm } } } costnormrast <- (0:(resorast-1))/resorast resovis <- 5 vecvis <- (0:resovis)/resovis veclabvis <- vecvis # plot(costrast,Qrast, type= "l", main= "Minimum loss by cost ratio (rasterised)",xlab= COSTRATIO,ylab= "Minimum achievable loss ", axes=FALSE) # axis(1, at=vecvis, lab=veclabvis) # axis(2) # box() ############################################### ############################################### ###### Expected costnorm Q (minimal) ########## ################# for SKEW #################### ################ BETA(2,2) #################### # CALCULATE THE MINIMUM LOSS VS c CURVE (UNNORMALISED, HAND, Beta(2,2)) costnormrastBeta22 <- pbeta(costnormrast, ALPHA_PLOT, BETA_PLOT) if (DRAW_SKEW_PLOTS) { if ((DRAW_OPTIMAL_CURVES) && (DRAW_BETA_CURVES)) { tit <- paste(EXPECTEDLOSS, BYSKEW) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(costnormrastBeta22,Qnormrast, ylim=c(0,YLIM), xlim=c(0,1), type= "l", main= tit,xlab= paste(SKEW, BETA22),ylab= LOSS, lwd=LWD_OPTIMAL_CURVES, pch= PCH_OPTIMAL_CURVES, axes=FALSE, lty= LTY_OPTIMAL_CURVES, col=COLOUR_OPTIMAL_CURVES) } } reso <- 1000 vec <- (0:reso)/reso veclab <- pbeta(vec, ALPHA_PLOT, BETA_PLOT) resovis <- 5 vecvis <- (0:resovis)/resovis veclabvis <- (0:resovis)/resovis eps <- 5/ reso for (i in 1:resovis) { for (j in 1:reso) { if (abs(veclab[j] - vecvis[i]) < eps) { veclabvis[i] <- (trunc((j/reso)*100))/100 break } } } if (DRAW_SKEW_PLOTS) { if ((DRAW_OPTIMAL_CURVES) && (DRAW_BETA_CURVES)) { axis(1, at=vecvis, lab=veclabvis) axis(2) box() } } AUCCnormBeta22 <- 0 for (i in 2:resorast) { AUCCnormBeta22 <- AUCCnormBeta22 + (costnormrastBeta22[i]-costnormrastBeta22[i-1]) * (Qnormrast[i]+Qnormrast[i-1])/2 } ############################################### ############################################### ###### Expected cost Qnomin (ROC) ############# ################## for SKEW ################### # Not done ############################################### ############################################### ###### Expected cost nomintotsnorm (ALL) ###### ################# for SKEW #################### if (!COSTLINESREP) { # Reduces the number of lines (eliminates repeated ones), but it computes the red line in a wrong way if there are ties. # AUCCnomintots xnomintotsnorm <- rep(0,K1*4) ynomintotsnorm <- rep(0,K1*4) Leftaccnorm <- 0 Rightaccnorm <- 0 AUCCnomintotsnorm <- 0 for (i in 1:K1) { Leftvnorm <- G1nomin[i]*0.5 * bfactor Rightvnorm <- (1 - G0nomin[i])*0.5 * bfactor w <- 1 # weight Leftaccnorm <- Leftaccnorm + w*Leftvnorm Rightaccnorm <- Rightaccnorm + w*Rightvnorm xnomintotsnorm[(i-1)*4+1] <- 0 xnomintotsnorm[(i-1)*4+2] <- 1 xnomintotsnorm[(i-1)*4+3] <- 1 xnomintotsnorm[(i-1)*4+4] <- 0 ynomintotsnorm[(i-1)*4+1] <- Leftvnorm ynomintotsnorm[(i-1)*4+2] <- Rightvnorm ynomintotsnorm[(i-1)*4+3] <- 0 ynomintotsnorm[(i-1)*4+4] <- 0 linianorm <- ( Rightvnorm + Leftvnorm) / 2 # print(linianorm) AUCCnomintotsnorm <- AUCCnomintotsnorm + linianorm # print(AUCCnomintotsnorm) } AUCCnomintotsnorm <- AUCCnomintotsnorm / K1 Leftaccnorm <- Leftaccnorm / K1 Rightaccnorm <- Rightaccnorm / K1 if (DRAW_SKEW_PLOTS) { if (DRAW_LOSSLINE) points(c(0,1),c(Leftaccnorm,Rightaccnorm),type= "l", lty=LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd=LWD_LOSSLINE) if (DRAW_COST_LINES) { if (!(OVERLAP_NONOPTIMAL)) { tit <- paste(LOSS,BYSKEW) if (NOTITLE) { par(mar=MARGINSNOTITLE) tit <- "" } plot(xnomintotsnorm,ynomintotsnorm,type= "l", main= tit,xlab= "skew ",ylab= COST, lty=2, col=COLOUR_COST_LINES) } else { points(xnomintotsnorm,ynomintotsnorm,type= "l", lty=2, col=COLOUR_COST_LINES) } if (DRAW_LOSSLINE) { points(c(0,1),c(Leftaccnorm,Rightaccnorm),type= "l", lty=LTY_LOSSLINE, col = COLOUR_LOSSLINE, lwd=LWD_LOSSLINE) } if (DRAW_COSTSQUARE) { # points(c(0,max(ynomintotsnorm)),c(0,0), type= "l", col = "grey", lwd=1) points(c(0,1),c(0,0), type= "l", col = "grey", lwd=1) points(c(0,0),c(0,1), type= "l", col = "grey", lwd=1) # points(c(1,max(ynomintotsnorm)),c(0,1), type= "l", col = "grey", lwd=1) points(c(1,1),c(0,1), type= "l", col = "grey", lwd=1) } } # col = "dark red" if (DRAW_OPTIMAL_CURVES_AGAIN) { points(costnorm,Qnorm, lwd=LWD_OPTIMAL_CURVE_AGAIN ,type= "o", col=COLOUR_OPTIMAL_CURVE_AGAIN, lty= LTY_OPTIMAL_CURVES, pch=PCH_OPTIMAL_CURVES) } if (DRAW_PROB_CURVES_AGAIN) { lines(costprobnewnorm,Qprobnewnorm, type= TYPE_PROB_CURVES, lwd=LWD_PROB_CURVE_AGAIN, col=COLOUR_PROB_CURVE_AGAIN, pch= PCH_PROB_CURVES) } } } ############################################### ############################################### ############ CLOSING OUTPUT (PDF) ############# if (PDF) { if (CLOSEPDF) { dev.off() } } ############################################### ############################################### ############## Show statistics ################ print('Results:') print('MSE') print(MSE(inp)) print('AUC') print(AUC) ExpLossZ(inp,ProbThresFunZ) # wBrier score (probabilistic threshold). Equals AUCCprobnewnorm ExpLossC(inp,ProbThresFunC) # Brier score (probabilistic threshold) print("SCRIPT FINISHED") ###############################################