bnp

Some older Bayesian nonparametrics research.

commit 5d6215744119da697bd24bbaa28ae6d62c6a9e2c
parent 8b1f7d3b4f2f887498ec3ede4bb83636deed8190
Author: Jared Tobin <jared@jtobin.ca>
Date:   Mon, 15 Feb 2016 13:01:41 +1300

Cleanup.

Diffstat:
M .gitignore | 5 +++++
D Makefile | 27 ---------------------------
D dirichlet-process-mixture/src/Generative.hs | 50 --------------------------------------------------
D dirichlet-process-mixture/src/explore_generative.r | 12 ------------
D dirichlet-process-mixture/src/fmm.r | 289 ------------------------------------------------------------------------------
D dirichlet-process-mixture/src/spiral.r | 18 ------------------
D finite-gaussian-mixture/src/fmm.r | 150 -------------------------------------------------------------------------------
D src/clean_gdp_data.r | 26 --------------------------
D src/explore.r | 12 ------------
9 files changed, 5 insertions(+), 584 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,8 @@ reference
 data
 *swp
 debug
+*aux
+*log
+*pdf
+deprecated
+.DS_Store
diff --git a/Makefile b/Makefile
@@ -1,27 +0,0 @@
-PROJECT_DIR = $(HOME)/projects/bnp
-
-DATA_DIR = $(PROJECT_DIR)/data
-RAW_DATA_DIR = $(DATA_DIR)/raw
-WORKING_DATA_DIR = $(PROJECT_DIR)/data/working
-INPUT_DATA_DIR = $(PROJECT_DIR)/data/input
-
-SRC_DIR = $(PROJECT_DIR)/src
-CLEAN_DATA_SCRIPT = $(SRC_DIR)/clean_gdp_data.r
-
-# http://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG
-RAW_GDP_DATA_URL = 'http://api.worldbank.org/v2/en/indicator/ny.gdp.mktp.kd.zg?downloadformat=csv'
-RAW_GDP_DATA_BASE = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2'
-
-$(RAW_DATA_DIR)/$(RAW_GDP_DATA_BASE).zip:
-	curl $(RAW_GDP_DATA_URL) > $@
-
-$(WORKING_DATA_DIR)/%.csv: \
-  $(RAW_DATA_DIR)/%.zip
-	unzip $< $(notdir $@) -d $(WORKING_DATA_DIR)
-
-$(INPUT_DATA_DIR)/%.csv: \
-  $(WORKING_DATA_DIR)/%.csv
-	$(CLEAN_DATA_SCRIPT)
-
-all: $(INPUT_DATA_DIR)/$(RAW_GDP_DATA_BASE).csv
-
diff --git a/dirichlet-process-mixture/src/Generative.hs b/dirichlet-process-mixture/src/Generative.hs
@@ -1,50 +0,0 @@
-
--- | A generative (prior predictive) finite mixture model.
-
-module Generative where
-
-import Control.Monad (replicateM, zipWithM)
-import System.Random.MWC.Probability
-
--- | Mixing probabilities for the model.
-mixing :: Int -> Prob IO [Double]
-mixing k = do
-  a <- inverseGamma 1 1
-  symmetricDirichlet k a
-
--- | Mean and precision parameters for clusters in the model.
-locationScale :: Int -> Double -> Double -> Prob IO [(Double, Double)]
-locationScale k muy vary = do
-  l <- normal muy (sqrt vary)
-  r <- gamma 1 (recip (sqrt vary))
-  mus <- replicateM k (normal l (sqrt (recip r)))
-  b <- inverseGamma 1 1
-  w <- gamma 1 vary
-  ss <- replicateM k (gamma b (recip w))
-  return $ zip mus ss
-
--- | Finite Gaussian mixture model.
-fmm :: Int -> Double -> Double -> Prob IO [Double]
-fmm k muy vary = do
-  (mus, ss) <- fmap unzip (locationScale k muy vary)
-  pis <- mixing k
-  xs <- zipWithM normal mus (fmap (sqrt . recip) ss)
-  return $ zipWith (*) pis xs
-
--- | Conditional mixture model. Sample a set of probabilities from the mixing
--- model and then use those to generate observations in each cluster.
-conditional :: Int -> Int -> Double -> Double -> Prob IO [[Double]]
-conditional n k muy vary = do
-  (mus, ss) <- fmap unzip (locationScale k muy vary)
-  pis <- mixing k
-  replicateM n $ do
-    f <- zipWithM normal mus (fmap (sqrt . recip) ss)
-    return $ zipWith (*) pis f
-
--- | Sample 5000 times from a conditional mixture model.
-main :: IO ()
-main = do
-  samples <- withSystemRandom . asGenIO $ sample (conditional 5000 5 1 1)
-  let pretty = putStrLn . filter (`notElem` "[]") . show
-  mapM_ pretty samples
-
diff --git a/dirichlet-process-mixture/src/explore_generative.r b/dirichlet-process-mixture/src/explore_generative.r
@@ -1,12 +0,0 @@
-require(ggplot2)
-require(reshape)
-require(dplyr)
-
-d = read.csv('tmp.dat', header = F, colClasses = 'numeric')
-
-melted = melt(d)
-
-mixturePriorPlot =
-  ggplot(melted, aes(value, fill = variable, colour = variable)) +
-  geom_density(alpha = 0.2)
-
diff --git a/dirichlet-process-mixture/src/fmm.r b/dirichlet-process-mixture/src/fmm.r
@@ -1,289 +0,0 @@
-set.seed(42)
-
-require(actuar)
-require(ars)
-require(gtools)
-
-# finite gaussian mixture model
-#
-# a ~ inverse-gamma(1, 1)
-# p | a ~ symmetric-dirichlet(a)
-# c | p ~ multinomial(p)
-# l ~ gaussian(mu_y, var_y)
-# r ~ gamma(1, prec_y)
-# mu | c, l, r ~ gaussian(l, 1 / r)
-# s | c, b, w ~ gamma(b, 1 / w)
-# b ~ inverse-gamma(1, 1)
-# w ~ gamma(1, var_y)
-# y | p, c, mu, s ~ <p, normal(mu, 1 / s)>
-
-test_data = list(
-    rnorm(801, 3.5, 1)
-  , rnorm(300, 0.3, 0.8)
-  , rnorm(722, -4.2, 0.5)
-  )
-
-# observations
-
-dy_prior = function(y, mu, s, p) sum(p * dnorm(y, mu, 1 / s))
-
-ry_prior = function(mu, s, p) {
-  c = drop(rmultinom(1, 1, prob = p))
-  muc = drop(c %*% mu)
-  sc = drop(c %*% s)
-  rnorm(1, muc, 1 / sc)
-  }
-
-# mu (component means) and parameter models
-
-l_obj_prior = function(y) {
-  mu_y = mean(unlist(y))
-  var_y = var(unlist(y))
-
-  density = function(l) { dnorm(l, mu_y, var_y) }
-  sampler = rnorm(1, mu_y, var_y)
-  list(density, sampler)
-  }
-
-dl_obj_prior = function(l, y) { l_obj_prior(y)[[1]](l) }
-rl_obj_prior = function(y) { l_obj_prior(y)[[2]] }
-
-l_posterior = function(y, mu, r) {
-  mu_y = mean(unlist(y))
-  var_y = var(unlist(y))
-  prec_y = 1 / var_y
-  k = length(y)
-  m = (mu_y * prec_y + r * sum(mu)) / (prec_y + k * r)
-  v = 1 / (prec_y + k * r)
-
-  density = function(l) { dnorm(l, m, v) }
-  sampler = rnorm(1, m, v)
-  list(density, sampler)
-  }
-
-dl_posterior = function(l, y, mu, r) { l_posterior(y, mu, r)[[1]](l) }
-rl_posterior = function(y, mu, r) { l_posterior(y, mu, r)[[2]] }
-
-r_obj_prior = function(y) {
-  var_y = var(unlist(y))
-  prec_y = 1 / var_y
-
-  density = function(r) { dgamma(r, 1, prec_y) }
-  sampler = rgamma(1, 1, prec_y)
-  list(density, sampler)
-  }
-
-dr_obj_prior = function(r, y) { r_obj_prior(y)[[1]](r) }
-rr_obj_prior = function(y) { r_obj_prior(y)[[2]] }
-
-r_posterior = function(y, mu, l) {
-  var_y = var(unlist(y))
-  k = length(y)
-  a = k + 1
-  b = a / (var_y + sum((mu - l)^2))
-
-  density = function(r) { dgamma(r, a, b) }
-  sampler = rgamma(1, a, b)
-  list(density, sampler)
-  }
-
-dr_posterior = function(r, y, mu, l) { r_posterior(y, mu, l)[[1]](r) }
-rr_posterior = function(y, mu, l) { r_posterior(y, mu, l)[[2]] }
-
-dmu_prior = function(mu, l, r) dnorm(mu, l, 1 / r)
-rmu_prior = function(k, l, r) rnorm(k, l, 1 / r)
-
-mu_posterior = function(y, s, l, r) {
-  n = unlist(lapply(y, length))
-  ybar = unlist(lapply(y, mean))
-  m = (ybar * n * s + l * r) / (n * s + r)
-  v = 1 / (n * s + r)
-
-  density = function(x) { dnorm(x, m, v) }
-  sampler = rnorm(length(y), m, v)
-  list(density, sampler)
-  }
-
-dmu_posterior = function(mu, y, s, l, r) { mu_posterior(y, s, l, r)[[1]](mu) }
-rmu_posterior = function(y, s, l, r) { mu_posterior(y, s, l, r)[[2]] }
-
-# s (component precisions) and parameter models
-
-ds_prior = function(s, b, w) dgamma(s, b, 1 / w)
-rs_prior = function(k, b, w) rgamma(k, b, 1 / w)
-
-s_posterior = function(y, mu, b, w) {
-  n = unlist(lapply(y, length))
-  squares = unlist(mapply("-", y, as.list(mu))) ^ 2
-  a = b + n
-  bet = a / (w * b + sum(squares))
-
-  density = function(s) { dgamma(s, a, bet) }
-  sampler = rgamma(1, a, bet)
-  list(density, sampler)
-  }
-
-ds_posterior = function(s, y, mu, b, w) { s_posterior(y, mu, b, w)[[1]](s) }
-rs_posterior = function(y, mu, b, w) { s_posterior(y, mu, b, w)[[2]] }
-
-w_obj_prior = function(y) {
-  var_y = var(unlist(y))
-  density = function(w) { dgamma(w, 1, var_y) }
-  sampler = rgamma(1, 1, var_y)
-  list(density, sampler)
-  }
-
-dw_obj_prior = function(w, y) { w_obj_prior(y)[[1]](w) }
-rw_obj_prior = function(y) { w_obj_prior(y)[[2]] }
-
-w_posterior = function(y, s, b) {
-  k = length(y)
-  var_y = var(unlist(y))
-  prec_y = 1 / var_y
-  a = k * b + 1
-  bet = a / (prec_y + b * sum(s))
-
-  density = function(w) { dgamma(w, a, bet) }
-  sampler = dgamma(1, a, bet)
-  list(density, sampler)
-  }
-
-dw_posterior = function(w, y, s, b) { w_posterior(y, s, b)[[1]](w) }
-rw_posterior = function(y, s, b) { w_posterior(y, s, b)[[2]] }
-
-db_prior = function(b) dinvgamma(b, 1, 1)
-rb_prior = function() rinvgamma(1, 1, 1)
-
-b_posterior = function(y, s, w) {
-  k = length(y)
-
-  log_density = function(b) {
-    -k * lgamma(b / 2) -
-      (1 / (2 * b)) +
-      ((k * b - 3) / 2) * log(b / 2) +
-      b / 2 * sum((log(s * w)) - w * s)
-    }
-
-  density = function(b) { exp(log_density(b)) }
-
-  sampler = function() {
-
-    # CHECK do i really need to transform this? is it log-concave already?
-    density_xformed = function(b) { b * density(b) }
-    log_density_xformed = function(b) { log(density_xformed(b)) }
-
-    grad_log_density_xformed = function(b) {
-      1 / b - k * digamma(b / 2) +
-        1 / (2 * b ^ 2) +
-        k / 2 * log(b / 2) + (k * b - 3) / b +
-        sum((log(s * w)) - w * s) / 2
-      }
-
-    # error due to non-log-concavity detection
-    val = ars(
-        n = 1
-      , f = log_density_xformed
-      , fprima = grad_log_density_xformed
-      , x = c(0.01, 0.5, 1)
-      , lb = T
-      , xlb = 0.01
-      , ub = T
-      , xub = 10)
-
-    exp(sample(val, 1))
-    }
-
-  list(density, sampler)
-  }
-
-db_posterior = function(b, y, s, w) { b_posterior(y, s, w)[[1]](b) }
-rb_posterior = function(y, s, w) { b_posterior(y, s, w)[[2]]() }
-
-# p (mixing probabilities), n (occupation numbers) and parameter models
-
-dp_prior = function(p, a) ddirichlet(p, a)
-rp_prior = function(a) drop(rdirichlet(1, a))
-
-dn_prior = function(n, p) dmultinom(n, prob = p)
-rn_prior = function(p) drop(rmultinom(1, size = 1, prob = p))
-
-da_prior = function(a) dinvgamma(a, 1, 1)
-ra_prior = rinvgamma(1, 1, 1)
-
-a_posterior = function(k, n) {
-  log_density = function(a) {
-    (k - 3 / 2) * log(a) - 1 / (2 * a) + lgamma(a) - lgamma(n + a)
-    }
-
-  density = function(a) { exp(log_density(a)) }
-
-  # log(a) ~ log-concave and can thus be sampled via ARS
-  #
-  # note that f(log(a)) = a * f(a), or log(f(log(a))) = log a + log f(a)
-  #
-  # ARS flakes out when k / n is sufficiently small (i.e. when 'a' would be
-  # extremely large). probably rare for this to happen in practice, so 'a' is
-  # capped at 20 here
-  sampler = function() {
-    log_xformed = expression(
-      (k - 1 / 2) * log(a) - 1 / (2 * a) + lgamma(a) - lgamma(n + a))
-    dens_log_xformed = function(a) eval(log_xformed)
-
-    grad_log_xformed = D(log_xformed, 'a')
-    grad_dens_log_xformed = function(a) eval(grad_log_xformed)
-
-    val = ars(
-        n = 1
-      , f = dens_log_xformed
-      , fprima = grad_dens_log_xformed
-      , x = c(0.51, 1, 1.49)
-      , lb = T
-      , xlb = 0.001
-      , ub = T
-      , xub = 20.0)
-
-    exp(val)
-    }
-  list(density, sampler)
-  }
-
-da_posterior = function(a, k, n) { a_posterior(k, n)[[1]](a) }
-ra_posterior = function(k, n) { a_posterior(k, n)[[2]]() }
-
-# model
-
-model_prior = function(y) {
-  k = length(y)
-  l = rl_obj_prior(y)
-  r = rr_obj_prior(y)
-  mu = rmu_prior(k, l, r)
-  b = rb_prior()
-  w = rw_obj_prior(y)
-  s = rs_prior(k, b, w)
-  a = ra_prior()
-  p = rp_prior(rep(a, k))
-
-  sampler = ry_prior(mu, s, p)
-  conditional_sampler = function(n) { replicate(n, ry_prior(mu, s, p)()) }
-  list(sampler, conditional_sampler)
-  }
-
-rmodel_prior = function(y) { model_prior(y)[[1]] }
-
-# conditional model prior; sample a cluster configuration and then sample many
-# observations from that configuration
-
-rmodel_conditional_prior = function(n, y) { model_prior(y)[[2]](n) }
-
-# debug
-
-# y = test_data
-# k = length(y)
-# l = rl_obj_prior(y)
-# r = rr_obj_prior(y)
-# mu = rmu_prior(k, l, r)
-# w = rw_obj_prior(y)
-# b = rb_prior()
-# s = rs_prior(k, b, w)
-
-
diff --git a/dirichlet-process-mixture/src/spiral.r b/dirichlet-process-mixture/src/spiral.r
@@ -1,18 +0,0 @@
-require(MASS)
-require(scatterplot3d)
-
-set.seed(42)
-
-n = 800
-t = sort(runif(n) * 4 * pi)
-
-x = (13 - 0.5 * t) * cos(t)
-y = (13 - 0.5 * t) * sin(t)
-Z = mvrnorm(n, mu = rep(0, 3), Sigma = 0.5 * diag(3))
-
-X = matrix(data = c(x, y, t), nrow = n, ncol = 3) + Z
-
-# visualization
-
-# quartz()
-# scatterplot3d(X, highlight.3d = T, pch = 19)
diff --git a/finite-gaussian-mixture/src/fmm.r b/finite-gaussian-mixture/src/fmm.r
@@ -1,150 +0,0 @@
-set.seed(42)
-
-require(dplyr)
-require(gtools)
-require(magrittr)
-
-mixing_model = function(k, a) drop(rdirichlet(1, (rep(a, k))))
-label_model = function(n, p) drop(rmultinom(1, size = n, prob = p))
-location_model = function(k, l, r) rnorm(k, l, 1 / r)
-precision_model = function(k, b, w) rgamma(k, b, 1 / w)
-
-parameter_model = function(k, n) {
-  p = mixing_model(k, 1)
-  c = label_model(n, p)
-  mu = location_model(k, 0, 0.1)
-  s = precision_model(k, 1, 1)
-  data.frame(c = c, mu = mu, s = s)
-  }
-
-data_model = function(config) {
-  sampler = function(row) rnorm(row[1], row[2], 1 / row[3])
-  apply(config, MARGIN = 1, sampler)
-  }
-
-model = function(k, n) parameter_model(k, n) %>% data_model
-
-# conditional models
-
-conditional_mixing_model = function(n, a0) {
-  k = length(n)
-  a1 = sapply(n, function(x) { x + a0 / k })
-  drop(rdirichlet(1, a1))
-  }
-
-# FIXME correctness dubious; easy to get NaN probabilities
-conditional_label_model = function(y, p0, mu, s) {
-  k = length(p0)
-  reducer = function(p, m, prec) { p * dnorm(y, m, 1 / prec) }
-  elements = mapply(reducer, p, m, s)
-  p1 = elements / sum(elements)
-  sample(1:k, size = 1, prob = p1)
-  }
-
-# FIXME correctness dubious; seems incredibly precise no matter what s is
-conditional_location_model = function(y, s, l, r) {
-  k = length(s)
-  n = sapply(y, length)
-  yt = sapply(y, sum)
-  m = (yt * s + l * r) / (n * s + r)
-  v = 1 / (n * s + r)
-  mapply(rnorm, 1, m, v)
-  }
-
-# FIXME correctness dubious; precisions are very large
-conditional_precision_model = function(y, mu, b, w) {
-  k = length(y)
-  c = sapply(y, length)
-
-  centered = mapply("-", y, mu)
-  squared = lapply(centered, function(x) { x ^ 2 })
-  ss = unlist(lapply(squared, sum))
-
-  a = b + c
-  bet = a / (w * b + ss)
-
-  reducer = function(a, b) { rgamma(1, a, b) }
-  mapply(reducer, a, bet)
-  }
-
-inverse_model = function(n, y) {
-  k = 3
-  a = 1
-  l = 0
-  r = 0.1
-  b = 1
-  w = 1
-
-  raw = unlist(y)
-
-  kernel = function(p0, mu0, s0) {
-    z = sapply(raw, function(x) conditional_label_model(x, p0, mu0, s0)) # FIXME slow
-    labelled = data.frame(label = z, y = raw)
-    id_query = function(c) filter(labelled, label == c)$y
-
-    clustered = sapply(1:k, id_query)
-    counts = sapply(clustered, length)
-
-    p1 = conditional_mixing_model(counts, a)
-    mu1 = conditional_location_model(clustered, s0, l, r)
-    s1 = conditional_precision_model(clustered, mu1, b, w)
-    list(p = p1, mu = mu1, s = s1)
-    }
-
-  gibbs = function(epochs, acc, p0, mu0, s0) {
-    if (epochs <= 0) {
-      return(acc)
-    } else {
-      result = kernel(p0, mu0, s0)
-      # FIXME mapply(rbind, acc, result) ?
-      acc$p = rbind(acc$p, result$p)
-      acc$mu = rbind(acc$mu, result$mu)
-      acc$s = rbind(acc$s, result$s)
-      gibbs(epochs - 1, acc, result$p, result$mu, result$s)
-    }
-    }
-
-  p0 = mixing_model(k, a)
-  mu0 = location_model(k, l, r)
-  s0 = precision_model(k, b, w)
-  init = list(p = p0, mu = mu0, s = s0)
-
-  gibbs(n, init, p0, mu0, s0)
-  }
-
-# utilities
-
-safe_mean = function(x) {
-  if (is.null(x) || (length(x) == 0)) {
-    return(0)
-  } else {
-    mean(x)
-  }
-  }
-
-# debug
-
-test_data = list(
-    rnorm(101, 10.5, 1)
-  , rnorm(38, 0.3, 1)
-  , rnorm(90, -8.2, 0.5)
-  )
-
-# y = list(
-#     rnorm(801, 3.5, 1)
-#   , rnorm(300, 0.3, 0.8)
-#   , rnorm(722, -4.2, 0.5)
-#   )
-# k = length(y)
-# l = 0
-# r = 0.1
-# mu = location_model(k, l, r)
-# w = 1
-# b = 1
-# s = precision_model(k, b, w)
-# p = mixing_model(k, 1)
-# c = drop(rmultinom(1, 1, prob = p))
-# p0 = p
-# mu0 = mu
-# s0 = s
-
diff --git a/src/clean_gdp_data.r b/src/clean_gdp_data.r
@@ -1,26 +0,0 @@
-#!/usr/bin/Rscript
-
-HOME = Sys.getenv('HOME')
-
-wb_gdp_data_file = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv'
-
-data_dir = paste(HOME, 'projects/bnp/data', sep = '/')
-working_data_dir = paste(data_dir, 'working', sep = '/')
-input_data_dir = paste(data_dir, 'input', sep = '/')
-wb_gdp_data = paste(working_data_dir, wb_gdp_data_file, sep = '/')
-
-prune = function(data) {
-  pruned = data[,c('Country.Name', 'X2014')]
-  names(pruned) = c('country', 'rate')
-  completes_only = pruned[complete.cases(pruned),]
-  return(completes_only)
-  }
-
-d = read.csv(wb_gdp_data, header = T, skip = 4)
-
-write.csv(
-    prune(d)
-  , paste(input_data_dir, 'gdp.csv', sep = '/')
-  , row.names = F
-  )
-
diff --git a/src/explore.r b/src/explore.r
@@ -1,12 +0,0 @@
-#!/usr/bin/Rscript
-
-data_dir = paste(HOME, 'projects/bnp/data', sep = '/')
-input_data_dir = paste(data_dir, 'input', sep = '/')
-
-gdp_data = paste(input_data_dir, 'gdp.csv', sep = '/')
-
-d = read.csv(gdp_data, header = T, colClasses = c('factor', 'numeric'))
-
-require(ggplot2)
-
-g = ggplot(d[,2], aes(rate))