############################################################################## # # This is R code for calculating and drawing RROC curves (ROC curves for regression models) and their convex hulls. # # For information about RROC curves, see the following paper: # # - J. Hernandez-Orallo "ROC Curves for Regression", submitted (send me a message to get a copy) # # The program calculates and draws: # - RROC curves # - Convex hull of several curves. # - Their areas # # This code has been developed by # JOSE HERNANDEZ-ORALLO, UNIVERSITAT POLITECNICA DE VALENCIA, SPAIN # jorallo@dsic.upv.es # # COPYRIGHT: # Except for peer-reviewing processes and checks related to reproduceability of results, any other use of this software (even non-profit, academic uses) *should* be done after contacting the author first. I will most probably grant permission to use it freely, but I want to keep track of who is using it (while some papers are under review). # # VERSION HISTORY: # - V.0.8 8 Oct 2012. First stable version with most functionalities. # - V.0.9 23 Oct 2012. It includes the possibility of normalising the axes (/N). # - V.1.0 24 Oct 2012. Calculates the convex hull # - V.1.0.1 11 Nov 2012. We improve the documentation and make it available # # FUTURE FEATURES # - It would be nice to draw isometrics and find the point where it touches a curve. # ############################################################################## ############################################################################## ################ FUNCTIONS PROVIDED AND A SHORT EXPLANATION ################## ################ (more details on each function definition) ################## ############################################################################## # # RROC_curve(...) : Returns a RROC curve of a regression model and (optionally) plots it on a previously created RROC space # # RROC_convexhull(...): Returns convex hull of a set of points in RROC space and (optionally) plots it on a previously created RROC space # # plot_RROCspace(...): Plots the RROC space # # cal_regressor(...): Calibrates a regression model by adding a constant shift to get 0 bias (overpredictions = underpredictions) # ############################################################################## ############################################################################## ############################# CODE ORGANISATION ############################## # # The firs part includes the general OPTIONS, next the defined FUNCTIONS. # Finally, some sample code ("MAIN)") you can customise to plot some examples. # At the end there are more sample material that I used for the paper above. # ############################################################################## ###################### ###### OPTIONS ####### ###################### # Graphic options NEWPLOT <- TRUE # Draws RROC space. Use whenever you want to start a new plot. If you want to draw curves on an existing plot, set this to FALSE NORMALISE_AXES <- FALSE # Divide x-axis and y-axis by n (OVER/n and UNDER/n) # PDF options PDFOPEN <- FALSE # If the plots are output on a PDF file PDFCLOSE <- PDFOPEN # Close the PDF file. This should match PDFOPEN, except when you want to draw several curves before closing. PDFheight= 5.5 # 7 is the default, so 14 makes it double higher than wide, 5 makes letters bigger (in proportion) for just one plot PDFwidth= 5 # (as above for width) 7 by default pdfname <- "RROCcurve.pdf" # File name # Directories where the PDF files will be output. Customise this for your computer. WORKDIR <- "C:/Users/jorallo/Desktop/Regression ROC Curves/script" # casa #WORKDIR <- "C:/Documents and Settings/jorallo.DSIC/Escritorio/Regression ROC Curves/script" # poli setwd(WORKDIR) ###################### ##### FUNCTIONS ###### ###################### #### RROC_curve #### # # Returns a RROC curve of a regression model and (optionally) plots it on a previous created RROC space # #### Arguments: # predicted: an array of predicted values # actual: an array of true values (size of predicted and actual should match) # mytext: text used to accompany the curve. If empty nothing is shown. # plot_point: if TRUE, it plots the point which corresponds to shift=0 (the crisp regression model). # plot_rectangle: if TRUE, shows the rectangle from the point which corresponds to shift=0 to RROC heaven (0,0). The perimeter of this rectangle equals twice the absolute error. # plot_curve: if TRUE, plots the RROC curve on a previously created RROC curve. Set to FALSE if you only want to get the points of curve. # colour1: colour used for points, rectangles and old way of calculating the curve (still used for other shift functions) # colour2: colour used for the curve # shiftfunction: we can provide a function to calculate the shift. If NULL, the usual RROC curves is plotted. # normalise_axes: divide x-axis and y-axis by N # N: Value used to normalise the axes (if (normalise_axes==FALSE) this value is ignored). Usually, for normalised curves, N has to be set to length(predicted). # xlim: size of the RROC space in the x-axis # ylim: size of the RROC space in the x-axis # USE_SD: if TRUE uses SD to calculate a relative shift. It is FALSE by default. # DRAW_CIRCLES: if TRUE shows a small circle for each point. It is TRUE by default. # #### Returns # The curve as a bidimensional matrix with the X,Y points of the curve. # RROC_curve <- function(predicted, actual, mytext, plot_point, plot_rectangle, plot_curve, colour1, colour2, shiftfunction=NULL, normalise_axes=FALSE, N, xlim0, ylim0, USE_SD=FALSE, DRAW_CIRCLES= TRUE) { nex <- length(predicted) # Number of examples orig_error <- predicted-actual # Calculates residuals (error vector) error_range <- max(orig_error) - min(orig_error) # Calculates the error range print("*************************************") print("Regression model:") print(rbind(predicted,actual,orig_error)) orig_over <- sum(orig_error[which(orig_error > 0)]) # Calculates OVER orig_under <- sum(orig_error[which(orig_error <= 0)]) # Calculates UNDER print("Original OVER and UNDER") print(orig_over) print(orig_under) mae <- mean(abs(orig_error)) # Calculates Mean Absolute Error mse <- mean(orig_error^2) # Calculates Mean Squared Error print("mae and mse") print(mae) print(mse) if (!normalise_axes) N <- 1 # In case we want to plot the 0-bias point and rectangle # shift=0 if (plot_point) { points(orig_over/N, orig_under/N, pch=7, col=colour1) # pch=2, 7, 15, circles: 19, 21 } if (plot_rectangle) { lines(c(orig_over/N,orig_over/N,xlim0*2), c(-ylim0*2, orig_under/N,orig_under/N), col=colour1) } if (!is.null(mytext)) { text((orig_over + xlim0/20)/N, (orig_under - ylim0/66.666666)/N, mytext, col=colour1, cex=0.75) } # Here we decide which method we use to draw the curve. if (is.null(shiftfunction)) { # If the shiftfunction is not specified (usual RROC curves) then the SEGMENTWISEMETHOD is used (the algorithm in the paper) POINTWISEMETHOD <- FALSE POINTWISECURVE <- FALSE # This can only be TRUE if POINTWISEMETHOD is TRUE SEGMENTWISEMETHOD <- TRUE } else { # This method just samples the space. It is valid for any shift function, but plots are constructed point by point and calculations are approximate. POINTWISEMETHOD <- TRUE POINTWISECURVE <- TRUE # This can only be TRUE if POINTWISEMETHOD is TRUE SEGMENTWISEMETHOD <- FALSE } ### METHOD 1 (NOT-COMMONLY USED NOW, ONLY FOR NON-USUAL (NON-OPTIMAL) SHIFT FUNCTIONS) if (POINTWISEMETHOD) { # This method is used when the shift function is not constant # This was the old method before developing the algorithm and now it is only used for special shift functions. RESO <- 2000 # This the resolution used. The greater the finer the curve is (the slower it is to draw, too) mins <- -error_range*1.01 # the lowest shift we will use (a little bit higher, 1%, than the error_range) maxs <- +error_range*1.01 # the highest shift we will use (a little bit higher, 1%, than the error_range) shift <- (((1:RESO) - RESO/2)/ (RESO)) * (maxs-mins) # how the shifts range under <- NULL # This vector accumulates underestimations and it is used to draw the curve over <- NULL # This vector accumulates overestimations and it is used to draw the curve shmae <- NULL shmse <- NULL underc <- -1 overc <- -1 ROCpointsSW_X <- NULL # This vector finally gets the curve (X) ROCpointsSW_Y <- NULL # This vector finally gets the curve (X) l <- 1 ROCpointsSW_X[l] <- 0 ROCpointsSW_Y[l] <- Inf for (i in 1:RESO) { # Use confidence modifier if (USE_SD) { prednew <- predicted + shift[i] * sd } else { if (is.null(shiftfunction)) { prednew <- predicted + shift[i] # Constant shift } else { prednew <- shiftfunction(predicted, shift[i]) } } error <- prednew - actual newl <- FALSE overv <- error[which(error > 0)] c <- length(overv) if (overc != c) { newl <- TRUE } overc <- c if (overc == 0) { over[i] <- 0 } else { over[i] <- sum(overv) } underv <- error[which(error <= 0)] # ! lower than equal here c <- length(underv) if (underc != c) { newl <- TRUE } underc <- c if (underc == 0) { under[i] <- 0 } else { under[i] <- sum(underv) } if (newl) { l <- l+1 ROCpointsSW_X[l] <- over[i] ROCpointsSW_Y[l] <- under[i] } shmae[i] <- (over[i] + under[i]) / nex shmse[i] <- (over[i] + under[i])^2 / nex } if (!newl) { l <- l+1 ROCpointsSW_X[l] <- over[i] ROCpointsSW_Y[l] <- under[i] } l <- l+1 ROCpointsSW_X[l] <- -Inf ROCpointsSW_Y[l] <- 0 AreaPW <- 0 AccLength <- 0 for (i in 2:RESO) { Length <- -(under[i]-under[i-1]) AccLength <- AccLength + Length AreaPW <- AreaPW - (Length)*(over[i] + over[i-1])/2 } print("Without ties: lines = nex + 2 + 2(infty lines)") print(length(ROCpointsSW_X)) print(nex) print("Area (pointwise, i.e., approximate):") print(AreaPW) if (plot_curve && POINTWISECURVE) { lines(over/N, under/N, col=colour1) } } # END POINTWISEMETHOD # A BETTER WAY TO DRAW THE CURVES FOR THE USUAL (OPTIMAL) SHIFT FUNCTION if (SEGMENTWISEMETHOD) { # This way can only be used when the shift function is constant and chosen optimally errors <- sort(orig_error, decreasing=TRUE) # sorts the errors n <- length(errors) bignumber <- sum(abs(errors))*1000 # We use a big number according to the scale of the errors (we cannot use infinite to draw) # Draws the curve from bottom-left corner to top-right corner ROCpointsSW_X <- NULL ROCpointsSW_Y <- NULL ROCpointsSW_X[1] <- 0 ROCpointsSW_Y[1] <- -bignumber # -Inf for(i in 1:(n)) { errorsi <- errors - errors[i] ROCpointsSW_X[i+1] <- sum(errorsi[which(errorsi > 0)]) ROCpointsSW_Y[i+1] <- sum(errorsi[which(errorsi <= 0)]) } ROCpointsSW_X[n+2] <- bignumber # Inf ROCpointsSW_Y[n+2] <- 0 if (plot_curve) { if (DRAW_CIRCLES) { mytype <- "o" } else { mytype <- "l" } lines(ROCpointsSW_X/N, ROCpointsSW_Y/N, col=colour2, type=mytype) } AreaSW <- 0 for(i in 3:(n+1)) { AreaSW <- AreaSW + ((-ROCpointsSW_Y[i]-ROCpointsSW_Y[i-1])/2)*(ROCpointsSW_X[i]-ROCpointsSW_X[i-1]) } # print(ROCpointsSW_X) # print(ROCpointsSW_Y) print("Area (segmentwise, i.e., exact):") print(AreaSW) print("var:") print((var(errors)*(n-1)/n) ) print("(var*(n-1)/n) *n^2 / 2:") print((var(errors)*(n-1)/n) *n ^2 / 2) } print("*************************************") if (SEGMENTWISEMETHOD) return(rbind(ROCpointsSW_X, ROCpointsSW_Y)) # Better, finer calculation if available else return(rbind(ROCpointsPW_X, ROCpointsPW_Y)) # Rough, approximate calculation if the other is not available } #### RROC_convexhull #### # # Returns convexhull of a set of points in RROC space and (optionally) plots it on a previous created RROC space # #### Arguments: # x,y: the arrays for the (x,y) points in RROC space # normalise_axes: divide x-axis and y-axis by N # N: Value used to normalise the axes (if (normalise_axes==FALSE) this value is ignored). Usually, for normalised curves, N has to be set to length(predicted). # plot_hull: If TRUE, also plots the hull # #### Returns # The hull as a bidimensional matrix with the X,Y points of the curve. # RROC_convexhull <- function(x, y, normalise_axes=FALSE, N, plot_hull) { POINTS_ON_THE_HULL <- chull(x,y) # Returns the indices of the points on the convex hull CONVEXHULLX <- x[POINTS_ON_THE_HULL] CONVEXHULLY <- y[POINTS_ON_THE_HULL] if (plot_hull) { if (!normalise_axes) { N <- 1 } lines(CONVEXHULLX/N, CONVEXHULLY/N, col="black", type='o', pch=4) } return(rbind(CONVEXHULLX, CONVEXULLY)) } #### plot_RROCspace #### # # Plots the RROC space # #### Arguments: # normalise_axes: divide x-axis and y-axis by N # N: Value used to normalise the axes (if (normalise_axes==FALSE) this value is ignored). Usually, for normalised curves, N has to be set to length(predicted). # xlim0, ylim0: part of the space shown # DRAW_DIAGONAL: if true draws a diagonal on the RROC space # plot_RROCspace <- function(normalise_axes=FALSE, N, xlim0, ylim0, DRAW_DIAGONAL) { if (!normalise_axes) { N <- 1 xlab <- "OVER" ylab <- "UNDER" } else { xlab <- "OVER / n" ylab <- "UNDER / n" } xlim0 <- xlim0 / N ylim0 <- ylim0 / N OVER=0 UNDER=0 plot(OVER, UNDER, lty="blank", col="white", xlim=c(0,xlim0), ylim=(c(-ylim0,0)), xlab=xlab, ylab=ylab) #xlim # draws diagonal if (DRAW_DIAGONAL) { lines(c(-xlim0*2,xlim0*2), c(ylim0*2,-ylim0*2), col="black", lty="dashed") # 0=blank, 1=solid, 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash } } #### cal_regressor #### # # Calibrates regressor by adding a constant shift to get 0 bias (overpredictions = underpredictions) # #### Arguments: # predicted: an array of predicted values # actual: an array of true values (size of predicted and actual should match) # #### Returns # The predictions shifted to have bias 0 # cal_regressor <- function(predicted, actual) { # shifts model predicted to have 0 bias (overpredictions = underpredictions) orig_error <- predicted - actual predicted <- predicted - sum(orig_error)/length(predicted) predicted } ###################### ####### MAIN ######### ###################### #### Optionally send the plots to a PDF file (if this is enabled they are not shown on the R console) #### # mymar <- c(5, 4, 4, 2) + 0.1 # default # mymar <- c(5, 5, 4, 2) + 0.1 # you can use this to modify the margins of your PDF file # par(mar=mymar) if (PDFOPEN) { pdf(pdfname, height= PDFheight, width= PDFwidth) } #### We define some regression models #### # "actual" is the vector with the true values actual <- c( 0.211, 2.725, 1.933, 3.242, 7.858, 6.061, 7.173, 3.082, 0.894, 1.203) N <- length(actual) # N is used for the size of the dataset # Some regression models (predictions) m1 <- c(-0.082, 3.323, 2.320, 1.080, 7.893, 4.983, 5.121, 3.442, 2.083, 1.112) # m1 in the paper m2 <- c( 0.786, 2.078, 0.587, 1.676, 9.052, 5.875, 6.885, 3.038, 4.097, 0.308) # m2 in the paper (symmetric, i.e., 0 bias) m3 <- c( 1.253, 4.232, 1.734, 5.325, 6.842, 9.325, 8.232, 3.525, 1.352, 1.778) # m3 in the paper m4ties <- c(0.123, -0.779+2, 1.845, 4.573, 8.558, 7.392, 3.669+2, -0.422+2, 0.806, 1.245) # m4 in the paper (has ties) m13mean <- (m1 + m3)/2 # Mean (usually better than m1 or m3 alone) mmean <- rep(mean(actual),10) # The mean of the true values (just to see how it looks) m1cal <- cal_regressor(m1, actual) # shifts m1 to have bias 0 (overpredictions = underpredictions) #### Create the RROC plot (empty for the moment) #### # SIZE OF THE PLOT (IN ABSOLUTE TERMS). Customise these depending on the magnitude of the errors xlim0 <- 20 ylim0 <- 20 if (NEWPLOT) plot_RROCspace(NORMALISE_AXES, length(actual), xlim0, ylim0, FALSE) #### Calculates and plots three curves #### PLOT_POINT <- FALSE PLOT_RECTANGLE <- FALSE PLOT_CURVE <- TRUE CURVE1 <- RROC_curve(m1, actual, expression("m"[1]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "red", "red", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "darkred" "pink" CURVE2 <- RROC_curve(m2, actual, expression("m"[2]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "blue", "blue", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "darkblue" "cyan" CURVE3 <- RROC_curve(m3, actual, expression("m"[3]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "green", "green", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # " "darkgreen"greenyellow" #### Joins all their points and calculates and plots their convex hull #### ALLPOINTSX <- c(CURVE1[1,], CURVE2[1,], CURVE3[1,]) ALLPOINTSY <- c(CURVE1[2,], CURVE2[2,], CURVE3[2,]) RROC_convexhull(ALLPOINTSX, ALLPOINTSY, normalise_axes=NORMALISE_AXES, N, TRUE) #### Plotting more models # Remove the comment for plotting these models as well # RROC_curve(m4ties, actual, expression("m"[4]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "darkolivegreen", "darkolivegreen", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "darkred" "pink" # RROC_curve(mmean, actual, NULL, PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "violet", "violet", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "purple" #### Optionally close the PDF file #### if (PDFCLOSE) { dev.off() } ###################### ### OTHER EXAMPLES ### ###################### #### We define shift functions here and use them for m1. Fig. 12 in the paper #### shiftfunction1 <- function(p, s) { 0.9*p + 0.002*p^3 + s # Good for m1 } shiftfunction2 <- function(p, s) { 1.0*p - 0.004*p^3 + 1.5*s*p^2 + s } # RROC_curve(m1, actual, expression("m"[1]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "purple", "purple", shiftfunction1, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "darkred" "pink" # RROC_curve(m1, actual, expression("m"[1]), PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "orange", "orange", shiftfunction2, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "darkred" "pink" #### Here we construct other models by duplicating the evidence #### # m13 <- cbind(m1,m3) / 2 # random choice betwen m1 and m3 # actual13 <- cbind(actual, actual) / 2 # RROC_curve(m13, actual13, NULL, PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "brown4", "brown", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "orange" #### Figures 9 and 10. Using some examples with more examples, using normal distributions #### MYREP <- 1000 MYSD <- 10/MYREP actualrand <- rnorm(MYREP,0,MYSD) predrand <- rnorm(MYREP,0,MYSD) prednoise <- actualrand + rnorm(MYREP,0,MYSD) # add noise predmean <- rep(mean(actualrand),MYREP) # Fig. 9 # RROC_curve(predrand, actualrand, NULL, PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "violet", "violet", NULL, normalise_axes=NORMALISE_AXES, xlim0, ylim0) # "purple" # RROC_curve(prednoise, actualrand, NULL, PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "orange", "orange", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "purple" # RROC_curve(predmean, actualrand, NULL, PLOT_POINT, PLOT_RECTANGLE, PLOT_CURVE, "brown", "brown", NULL, normalise_axes=NORMALISE_AXES, N, xlim0, ylim0) # "purple" # Fig 10 ## In this order, to choose the largest first # plot(density(predmean-actualrand), col="brown", main="") # lines(density(prednoise-actualrand), col="orange", main="") # lines(density(predrand-actualrand), col="violet", main="") # A histogram could also be plotted #### hist((predrand-actualrand)) #### This is more material used in the paper for some of the figures #### ## set NORMALISE_AXES== FALSE to get this right ## alpha= 0.8 leads to slope 0.25 and this line for models m1 and m3. DRAW_ALPHA_ISOMETRIC <- FALSE if (DRAW_ALPHA_ISOMETRIC) { intercept <- -3.82275 slope <- 0.25 abline(intercept, slope, col="lightgrey") } ## isometric connecting m1 and m3 DRAW_m13_ISOMETRIC <- FALSE if (DRAW_m13_ISOMETRIC) { intercept <- -7.133684 slope <- 0.5674129 abline(intercept, slope, col="lightgrey") } ## line connecting m1 and m3 DRAW_m13_SEGMENT <- FALSE if (DRAW_m13_SEGMENT) { lines(c(2.569,10.431), c(-5.676,-1.215), lty= convexlty, lwd=2) } ## CONVEX HULL for m1, m2 (discarded) and m3 DRAW_m13_CONVEXHULL <- FALSE if (DRAW_m13_CONVEXHULL) { convexlty= "solid" # "dotted" ## line connecting points m1 and m3 lines(c(2.569,10.431), c(-5.676,-1.215), lty= convexlty, lwd=2) ## vertical segment lines(c(2.569,2.569), c(-ylim0*2, -5.675), lty= convexlty, lwd=2) lines(c(10.431,xlim0*2), c(-1.215,-1.21), lty= convexlty, lwd=2) }