# Copyright (c) 2008 Rick Wash # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # powerlaw.R # # Contains code for the continuous and discrete powerlaw distributions # Also contains a model fit for powerlaw data # Mostly based on a paper from Mark Newman # M.ÊE.ÊJ. Newman. Power laws, pareto distributions and zipfÕs law. Contemporary Physics, 46:323Ð351, 2005. # # dpowerlaw(), ppowerlaw(), qpowerlaw(), and rpowerlaw(): # PDF, CDF, Quantile, and RNG functions for the continuous powerlaw distribution # # ddpowerlaw(), pdpowerlaw(), qdpowerlaw(), and rdpowerlaw(): # PDF, CDF, Quantile, and RNG functions for the discrete powerlaw distribution # # These functions all have the same parameters as the corresponding functions for other # distributions in R, except for the RNG: # # rpowerlaw(n, alpha, xmin=1) # - Generate n random numbers from a continuous powerlaw distribution. # # rdpowerlaw(n, alpha, xmin=1) # - Generate n random numbers from a discrete powerlaw distribution # # Model fit data to a powerlaw distribution: # # m <- plm(data) # - Fit a powerlaw distribution to data. data can be either continuous # or discrete. # # m <- plm(data, xmin=1) # - Fit a powerlaw distribution to data, with xmin pre-specified instead # of being estimated. Much faster. # # print(m) # m # - Print the estimated powerlaw distribution information. # # plot(m, ...) # - Plot a graph of the original data and the fitted powerlaw # distribution. ... are passed as options to xyplot. # # summary(m) # - Run a Kolomogorov-Smirnov goodness-of-fit test for the powerlaw # distribution. Low p values indicate that you can reject the powerlaw # distribution as a possible distribution. Somewhat slow, as the # p-value requires monte-carlo simulations to estimate. # # summary(m, epsilon=0.025) # - Run a KS goodness-of-fit test. Estimate the p-value to # approximately epsilon accuracy. Note that running time of # calculating the p-value is O(epsilon^2) (meaning, don't estimate it # all that accurately. Default is epsilon=0.1 library(gsl) library(numDeriv) # PDF, CDF, Quantile, and RNG functions for continuous power law distribution dpowerlaw <- function(x, alpha=2, xmin=1, log=F) { if (log) log(alpha-1) - log(xmin) - alpha * log(x / xmin) else ((alpha - 1) / xmin) * ((x / xmin) ^ (-alpha)) } ppowerlaw <- function(q, alpha=2, xmin=1, lower.tail=T, log.p = F) { p <- (q / xmin) ^ (- alpha + 1) if (lower.tail) p <- 1-p if (log.p) p <- log(p) p } qpowerlaw <- function(p, alpha=2, xmin=1, lower.tail=T, log.p = F) { if (!lower.tail) p <- 1-p if (log.p) p <- exp(p) xmin * ((1 - p) ^ (-1 / (alpha - 1))) } rpowerlaw <- function(n, alpha=2, xmin=1) { qpowerlaw(runif(n, 0, 1), alpha, xmin) } ddpowerlaw <- function(x, alpha = 2, xmin=1, log=F) { if (log) -log(hzeta(alpha, xmin)) - alpha * log(x) else x ^ (-alpha) / hzeta(alpha, xmin) } # PDF, CDF, Quantile, and RNG functions for discrete power law distributions pdpowerlaw <- function(q, alpha=2, xmin=1, lower.tail=T, log.p=F) { p <- hzeta(alpha, q) / hzeta(alpha, xmin) if (lower.tail) p <- 1-p if (log.p) p <- log(p) p } # Helper function individual_qdpowerlaw <- function(r, alpha, xmin) { x2 <- xmin # Doubling Up repeat { x1 <- x2 x2 <- 2*x1 if (pdpowerlaw(x2, alpha, xmin, lower.tail=F) < 1-r) break } # Binary Search repeat { temp <- (x2 + x1) / 2 if (pdpowerlaw(temp, alpha, xmin, lower.tail=F) < 1-r) { x2 <- temp } else { x1 <- temp } if (ceiling(x2) - floor(x1) < 2) break } floor(x1) } qdpowerlaw <- function(p, alpha=2, xmin=1, approx=F, lower.tail=T, log.p=F) { if (approx) { return(floor(qpowerlaw(p, alpha, xmin - 0.5, lower.tail, log.p) + 0.5)) } if (log.p) p <- exp(p) if (!lower.tail) p <- 1-p sapply(p, function(x) { individual_qdpowerlaw(x, alpha, xmin) }) } rdpowerlaw <- function(n, alpha=2, xmin=1, approx=F) { #if (approx) { #return(floor(rpowerlaw(n, alpha, xmin - 0.5) + 0.5)) #} r <- runif(n, 0, 1) qdpowerlaw(r, alpha, xmin, approx, lower.tail=FALSE, log.p=FALSE) } # Numerically fit a discrete powerlaw distribution via maximum likelihood plm.fit.discrete.numeric <- function(data, xmin=1, alpha.starting = 2) { # nlm the negative log likelihood: n log zeta(a, xmin) + a sum log x_i N <- length(data) #dpowerlaw.nloglik <- function(alpha) { N * log(hzeta(alpha, xmin)) + alpha * sum(log(data)) } dpowerlaw.nloglik <- function(alpha) { -sum(ddpowerlaw(data, alpha, xmin, log=T)) } alpha.info <- nlm(dpowerlaw.nloglik, alpha.starting) alpha <- alpha.info$estimate x2 <- grad(hzeta, alpha, q=xmin) / hzeta(alpha, xmin) x1 <- hessian(hzeta, alpha, q=xmin) / hzeta(alpha, xmin) sigma <- 1 / (N * (x1 - x2^2)) list(alpha = alpha, sigma = sigma, N = N, xmin=xmin, xmin.estimated=F) } # Fit a discrete power law distribution using a continuous approximation plm.fit.discrete.approx <- function(data, xmin = 1, ...) { N <- length(data) x <- sum(log(data / (xmin - 0.5))) alpha <- 1 + N / x sigma <- (alpha - 1) / sqrt(N) list(alpha = alpha, sigma = sigma, N = N, xmin=xmin, xmin.estimated=F) } plm.fit.discrete <- function(data, approx=F, xmin, ...) { data <- data[data >= xmin] if (approx) plm.fit.discrete.approx(data, xmin, ...) else plm.fit.discrete.numeric(data, xmin, ...) } # Fit a continuous power law distribution via maximum likelihood (direct calculation) plm.fit.continuous <- function(data, xmin = 1) { data <- data[data >= xmin] N <- length(data) x <- sum(log(data / xmin)) alpha <- 1 + N / x sigma <- (alpha - 1) / sqrt(N) list(alpha = alpha, sigma = sigma, N = N, xmin=xmin, xmin.estimated=F) } # Helper function to calculate KS statistic ks.plm <- function(data.cdf, x, alpha, xmin, discrete=T) { data.cdf <- data.cdf[x >= xmin] x <- x[x >= xmin] fit.cdf <- if (discrete) pdpowerlaw(x, alpha, xmin, lower.tail=F) else ppowerlaw(x, alpha, xmin, lower.tail=F) KS <- max(abs(data.cdf - fit.cdf)) } # Use search to simultaneously estimate xmin and alpha (discrete) plm.fit.xmin.discrete <- function(data, approx = F, xmin.starting = 1, alpha.starting = 2) { N <- length(data) x <- sort(unique(data)) cdf <- function(n) { sum(data >= n) / N } cdf.data <- sapply(x, cdf) try_xmin <- function(xmin) { if (xmin >= 1) { fit <- plm.fit.discrete(data, approx, xmin=xmin, alpha.starting=alpha.starting) KS <- ks.plm(cdf.data, x, fit$alpha, xmin, discrete=T) } else { fit <- NULL KS <- 2 # A high number so it never gets chosen } list(xmin = xmin, fit=fit, KS=KS) } current <- try_xmin(xmin.starting) left <- try_xmin(current$xmin - 1) right <- try_xmin(current$xmin + 1) repeat { # Test if we found the optimal xmin if ((current$KS <= left$KS) & (current$KS <= right$KS)) break # Test if we have gone too far if (right$xmin > max(data)) break # Move left or right appropriately if ((current$KS - left$KS) > (current$KS - right$KS)) { # Move left one right <- current current <- left left <- try_xmin(current$xmin - 1) } else { # Move right one left <- current current <- right right <- try_xmin(current$xmin + 1) } } list(alpha=current$fit$alpha, sigma=current$fit$sigma, N=current$fit$N, xmin=current$xmin, xmin.estimated=T, KS=current$KS) } # Use search to simultaneously estimate xmin and alpha (continuous) plm.fit.xmin.continuous <- function(data, xmin.starting) { N <- length(data) x <- sort(unique(data)) cdf <- function(n) { sum(data >= n) / N } cdf.data <- sapply(x, cdf) try_xmin <- function(xmin) { fit <- plm.fit.continuous(data, xmin) KS <- ks.plm(cdf.data, x, fit$alpha, xmin, discrete=F) KS } out <- nlm(try_xmin, xmin.starting) fit <- plm.fit.continuous(data, out$estimate) list(alpha = fit$alpha, sigma=fit$sigma, N=fit$N, xmin=fit$xmin, xmin.estimated=T, KS=out$minimum) } # Generic function for a power law model -- Fit data to a power-law distribution plm <- function(data, ...) { UseMethod("plm") } plm.default <- function(data, xmin=NULL, discrete = NULL, approx=FALSE, alpha.starting = 2, xmin.starting=1) { if (is.null(discrete)) { discrete = all(data == as.integer(data)) } if (is.null(xmin)) { # Estimate xmin as well as alpha info <- if (discrete) plm.fit.xmin.discrete(data, approx, xmin.starting=xmin.starting, alpha.starting=alpha.starting) else plm.fit.xmin.continuous(data, xmin.starting=xmin.starting) } else { info <- if (discrete) plm.fit.discrete(data, approx, xmin, alpha.starting=alpha.starting) else plm.fit.continuous(data, xmin) } info$data <- data info$discrete <- discrete info$approx <- approx class(info) <- "plm" info } plm.factor <- function(data, ...) { data <- tapply(data, data, length) NextMethod("plm") # plm.default(counts, discrete=T, ...) } # Helper functions for the "plm" class print.plm <- function(x) { if (x$discrete) { cat("Discrete Power Law Distribution Fit\n") } else { cat("Power Law Distribution Fit\n") } cat(paste(" Exponent:", x$alpha, "+-", x$sigma, "\n")) cat(paste(" Minimum X:", x$xmin, "\n")) if (x$xmin.estimated) cat(paste(" (estimated: KS =", x$KS, ")\n")) cat(paste(" N:", x$N, "\n")) } # Not sure if this works # Usage: # lrtest.plm(m, dexp, 0.125) To test against exponential # lrtest.plm(m, dlnorm, c(0.5, 2)) To test against log-normal lrtest.plm <- function(x, dist_func, start = list()) { if (x$discrete) loglik.powerlaw <- function(alpha) { ddpowerlaw(x$data, alpha, x$xmin, log=T) } else loglik.powerlaw <- function(alpha) { dpowerlaw(x$data, alpha, x$xmin, log=T) } loglik.other <- function(y) { do.call(dist_func, c(list(x=x$data, log=T), y)) } nloglik.other <- function(y) { -sum(loglik.other(y)) } other.estimate <- nlm(nloglik.other, start) #R <- (sum(loglik.powerlaw(x$alpha)) - sum(loglik.other(other.estimate$estimate))) / length(x$data) R <- sum(loglik.powerlaw(x$alpha)) - sum(loglik.other(other.estimate$estimate)) pl.mean <- mean(loglik.powerlaw(x$alpha)) other.mean <- mean(loglik.other(other.estimate$estimate)) sigsq <- sum(((loglik.powerlaw(x$alpha) - loglik.other(other.estimate$estimate)) - (pl.mean - other.mean))^2) / length(x$data) # Copied from help(Normal) erfc <- function(x) 2 * pnorm(x * sqrt(2), lower = FALSE) p <- erfc(R / sqrt(2 * length(x$data) * sigsq)) l <- sum(loglik.powerlaw(x$alpha)) list(R = R, sigsq = sigsq, p=p, fit=other.estimate, loglik=l) } summary.plm <- function(x, num_datasets = NULL, epsilon=0.1) { # Calculate goodness of fit statistic and p-value N <- length(x$data) if (x$xmin.estimated) KS.real <- x$KS else { y <- sort(unique(x$data)) cdf <- function(n) { sum(x$data >= n) / N } cdf.data <- sapply(y, cdf) KS.real <- ks.plm(cdf.data, y, x$alpha, x$xmin, x$discrete) } # Set number of datasets based on intended accuracy of p-value if (is.null(num_datasets)) num_datasets <- 0.25 * epsilon ^ (-2) # Probability of being below xmin data.low <- x$data[x$data < x$xmin] p <- length(data.low) / length(x$data) run_dataset <- function(N) { # Generate data if (length(data.low) > 0) { choices <- rbinom(N, 1, p) N.info <- tapply(rep(1,N), choices, length) N.low <- N.info[2] N.high <- N.info[1] rd.low <- data.low[floor(runif(N.low, 1, length(data.low)+1))] } else { N.high <- N rd.low <- numeric(0) } rd.high <- if (x$discrete) rdpowerlaw(N.high, x$alpha, x$xmin) else rpowerlaw(N.high, x$alpha, x$xmin) # Estimate model and KS statistic rd <- c(rd.low, rd.high) if (x$xmin.estimated) { fit <- plm(rd, xmin=NULL) KS <- fit$KS } else { fit <- plm(rd, xmin=x$xmin, discrete=x$discrete) y <- sort(unique(rd)) cdf <- function(n) { sum(rd >= n) / N } cdf.data <- sapply(y, cdf) KS <- ks.plm(cdf.data, y, x$alpha, x$xmin, x$discrete) } KS } # Get num_datasets KS statistics KS.sim <- sapply(rep(N, num_datasets), run_dataset) p <- sum(KS.sim > KS.real) / length(KS.sim) out <- list(fit = x, KS = KS.real, p = p, sim.results = KS.sim) class(out) <- c("summary.plm", class(out)) out } print.summary.plm <- function(x) { if (x$fit$discrete) { cat("Discrete Power Law Fit") } else { cat("Power Law Fit") } cat(paste(" with alpha =", x$fit$alpha, "+-", x$fit$sigma, "\n")) cat(paste("\nKolmogorov-Smirnov Goodness-of-Fit test\n Null Hypothesis is that the data fits a power-law\n")) cat(paste(" KS =", x$KS, "\n (p-value:", x$p, ")\n")); } # Plots the cumulative distribution function for both the data and the fit plot.plm <- function(x, hypo.col = "red", xlab="Value", ylab="Probability X > x", scales=list(x=list(log=T), y=list(log=T)), ...) { N <- length(x$data) plot_x <- sort(unique(x$data)) cdf <- function(n) { sum(x$data >= n) / N } real_y <- sapply(plot_x, cdf) if (x$discrete) { hypo_y <- pdpowerlaw(plot_x, x$alpha, x$xmin, lower.tail=F) } else { hypo_y <- ppowerlaw(plot_x, x$alpha, x$xmin, lower.tail=F) } df <- data.frame(X = plot_x, Data = real_y, Fit = hypo_y) df <- melt(df, measure.var = c("Data", "Fit")) xyplot(value ~ X, data=df, groups=variable, scales=scales, ylab = ylab, xlab=xlab, ...) }