commit 5d6215744119da697bd24bbaa28ae6d62c6a9e2c
parent 8b1f7d3b4f2f887498ec3ede4bb83636deed8190
Author: Jared Tobin <jared@jtobin.ca>
Date: Mon, 15 Feb 2016 13:01:41 +1300
Cleanup.
Diffstat:
9 files changed, 5 insertions(+), 584 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,8 @@ reference
data
*swp
debug
+*aux
+*log
+*pdf
+deprecated
+.DS_Store
diff --git a/Makefile b/Makefile
@@ -1,27 +0,0 @@
-PROJECT_DIR = $(HOME)/projects/bnp
-
-DATA_DIR = $(PROJECT_DIR)/data
-RAW_DATA_DIR = $(DATA_DIR)/raw
-WORKING_DATA_DIR = $(PROJECT_DIR)/data/working
-INPUT_DATA_DIR = $(PROJECT_DIR)/data/input
-
-SRC_DIR = $(PROJECT_DIR)/src
-CLEAN_DATA_SCRIPT = $(SRC_DIR)/clean_gdp_data.r
-
-# http://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG
-RAW_GDP_DATA_URL = 'http://api.worldbank.org/v2/en/indicator/ny.gdp.mktp.kd.zg?downloadformat=csv'
-RAW_GDP_DATA_BASE = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2'
-
-$(RAW_DATA_DIR)/$(RAW_GDP_DATA_BASE).zip:
- curl $(RAW_GDP_DATA_URL) > $@
-
-$(WORKING_DATA_DIR)/%.csv: \
- $(RAW_DATA_DIR)/%.zip
- unzip $< $(notdir $@) -d $(WORKING_DATA_DIR)
-
-$(INPUT_DATA_DIR)/%.csv: \
- $(WORKING_DATA_DIR)/%.csv
- $(CLEAN_DATA_SCRIPT)
-
-all: $(INPUT_DATA_DIR)/$(RAW_GDP_DATA_BASE).csv
-
diff --git a/dirichlet-process-mixture/src/Generative.hs b/dirichlet-process-mixture/src/Generative.hs
@@ -1,50 +0,0 @@
-
--- | A generative (prior predictive) finite mixture model.
-
-module Generative where
-
-import Control.Monad (replicateM, zipWithM)
-import System.Random.MWC.Probability
-
--- | Mixing probabilities for the model.
-mixing :: Int -> Prob IO [Double]
-mixing k = do
- a <- inverseGamma 1 1
- symmetricDirichlet k a
-
--- | Mean and precision parameters for clusters in the model.
-locationScale :: Int -> Double -> Double -> Prob IO [(Double, Double)]
-locationScale k muy vary = do
- l <- normal muy (sqrt vary)
- r <- gamma 1 (recip (sqrt vary))
- mus <- replicateM k (normal l (sqrt (recip r)))
- b <- inverseGamma 1 1
- w <- gamma 1 vary
- ss <- replicateM k (gamma b (recip w))
- return $ zip mus ss
-
--- | Finite Gaussian mixture model.
-fmm :: Int -> Double -> Double -> Prob IO [Double]
-fmm k muy vary = do
- (mus, ss) <- fmap unzip (locationScale k muy vary)
- pis <- mixing k
- xs <- zipWithM normal mus (fmap (sqrt . recip) ss)
- return $ zipWith (*) pis xs
-
--- | Conditional mixture model. Sample a set of probabilities from the mixing
--- model and then use those to generate observations in each cluster.
-conditional :: Int -> Int -> Double -> Double -> Prob IO [[Double]]
-conditional n k muy vary = do
- (mus, ss) <- fmap unzip (locationScale k muy vary)
- pis <- mixing k
- replicateM n $ do
- f <- zipWithM normal mus (fmap (sqrt . recip) ss)
- return $ zipWith (*) pis f
-
--- | Sample 5000 times from a conditional mixture model.
-main :: IO ()
-main = do
- samples <- withSystemRandom . asGenIO $ sample (conditional 5000 5 1 1)
- let pretty = putStrLn . filter (`notElem` "[]") . show
- mapM_ pretty samples
-
diff --git a/dirichlet-process-mixture/src/explore_generative.r b/dirichlet-process-mixture/src/explore_generative.r
@@ -1,12 +0,0 @@
-require(ggplot2)
-require(reshape)
-require(dplyr)
-
-d = read.csv('tmp.dat', header = F, colClasses = 'numeric')
-
-melted = melt(d)
-
-mixturePriorPlot =
- ggplot(melted, aes(value, fill = variable, colour = variable)) +
- geom_density(alpha = 0.2)
-
diff --git a/dirichlet-process-mixture/src/fmm.r b/dirichlet-process-mixture/src/fmm.r
@@ -1,289 +0,0 @@
-set.seed(42)
-
-require(actuar)
-require(ars)
-require(gtools)
-
-# finite gaussian mixture model
-#
-# a ~ inverse-gamma(1, 1)
-# p | a ~ symmetric-dirichlet(a)
-# c | p ~ multinomial(p)
-# l ~ gaussian(mu_y, var_y)
-# r ~ gamma(1, prec_y)
-# mu | c, l, r ~ gaussian(l, 1 / r)
-# s | c, b, w ~ gamma(b, 1 / w)
-# b ~ inverse-gamma(1, 1)
-# w ~ gamma(1, var_y)
-# y | p, c, mu, s ~ <p, normal(mu, 1 / s)>
-
-test_data = list(
- rnorm(801, 3.5, 1)
- , rnorm(300, 0.3, 0.8)
- , rnorm(722, -4.2, 0.5)
- )
-
-# observations
-
-dy_prior = function(y, mu, s, p) sum(p * dnorm(y, mu, 1 / s))
-
-ry_prior = function(mu, s, p) {
- c = drop(rmultinom(1, 1, prob = p))
- muc = drop(c %*% mu)
- sc = drop(c %*% s)
- rnorm(1, muc, 1 / sc)
- }
-
-# mu (component means) and parameter models
-
-l_obj_prior = function(y) {
- mu_y = mean(unlist(y))
- var_y = var(unlist(y))
-
- density = function(l) { dnorm(l, mu_y, var_y) }
- sampler = rnorm(1, mu_y, var_y)
- list(density, sampler)
- }
-
-dl_obj_prior = function(l, y) { l_obj_prior(y)[[1]](l) }
-rl_obj_prior = function(y) { l_obj_prior(y)[[2]] }
-
-l_posterior = function(y, mu, r) {
- mu_y = mean(unlist(y))
- var_y = var(unlist(y))
- prec_y = 1 / var_y
- k = length(y)
- m = (mu_y * prec_y + r * sum(mu)) / (prec_y + k * r)
- v = 1 / (prec_y + k * r)
-
- density = function(l) { dnorm(l, m, v) }
- sampler = rnorm(1, m, v)
- list(density, sampler)
- }
-
-dl_posterior = function(l, y, mu, r) { l_posterior(y, mu, r)[[1]](l) }
-rl_posterior = function(y, mu, r) { l_posterior(y, mu, r)[[2]] }
-
-r_obj_prior = function(y) {
- var_y = var(unlist(y))
- prec_y = 1 / var_y
-
- density = function(r) { dgamma(r, 1, prec_y) }
- sampler = rgamma(1, 1, prec_y)
- list(density, sampler)
- }
-
-dr_obj_prior = function(r, y) { r_obj_prior(y)[[1]](r) }
-rr_obj_prior = function(y) { r_obj_prior(y)[[2]] }
-
-r_posterior = function(y, mu, l) {
- var_y = var(unlist(y))
- k = length(y)
- a = k + 1
- b = a / (var_y + sum((mu - l)^2))
-
- density = function(r) { dgamma(r, a, b) }
- sampler = rgamma(1, a, b)
- list(density, sampler)
- }
-
-dr_posterior = function(r, y, mu, l) { r_posterior(y, mu, l)[[1]](r) }
-rr_posterior = function(y, mu, l) { r_posterior(y, mu, l)[[2]] }
-
-dmu_prior = function(mu, l, r) dnorm(mu, l, 1 / r)
-rmu_prior = function(k, l, r) rnorm(k, l, 1 / r)
-
-mu_posterior = function(y, s, l, r) {
- n = unlist(lapply(y, length))
- ybar = unlist(lapply(y, mean))
- m = (ybar * n * s + l * r) / (n * s + r)
- v = 1 / (n * s + r)
-
- density = function(x) { dnorm(x, m, v) }
- sampler = rnorm(length(y), m, v)
- list(density, sampler)
- }
-
-dmu_posterior = function(mu, y, s, l, r) { mu_posterior(y, s, l, r)[[1]](mu) }
-rmu_posterior = function(y, s, l, r) { mu_posterior(y, s, l, r)[[2]] }
-
-# s (component precisions) and parameter models
-
-ds_prior = function(s, b, w) dgamma(s, b, 1 / w)
-rs_prior = function(k, b, w) rgamma(k, b, 1 / w)
-
-s_posterior = function(y, mu, b, w) {
- n = unlist(lapply(y, length))
- squares = unlist(mapply("-", y, as.list(mu))) ^ 2
- a = b + n
- bet = a / (w * b + sum(squares))
-
- density = function(s) { dgamma(s, a, bet) }
- sampler = rgamma(1, a, bet)
- list(density, sampler)
- }
-
-ds_posterior = function(s, y, mu, b, w) { s_posterior(y, mu, b, w)[[1]](s) }
-rs_posterior = function(y, mu, b, w) { s_posterior(y, mu, b, w)[[2]] }
-
-w_obj_prior = function(y) {
- var_y = var(unlist(y))
- density = function(w) { dgamma(w, 1, var_y) }
- sampler = rgamma(1, 1, var_y)
- list(density, sampler)
- }
-
-dw_obj_prior = function(w, y) { w_obj_prior(y)[[1]](w) }
-rw_obj_prior = function(y) { w_obj_prior(y)[[2]] }
-
-w_posterior = function(y, s, b) {
- k = length(y)
- var_y = var(unlist(y))
- prec_y = 1 / var_y
- a = k * b + 1
- bet = a / (prec_y + b * sum(s))
-
- density = function(w) { dgamma(w, a, bet) }
- sampler = rgamma(1, a, bet)
- list(density, sampler)
- }
-
-dw_posterior = function(w, y, s, b) { w_posterior(y, s, b)[[1]](w) }
-rw_posterior = function(y, s, b) { w_posterior(y, s, b)[[2]] }
-
-db_prior = function(b) dinvgamma(b, 1, 1)
-rb_prior = function() rinvgamma(1, 1, 1)
-
-b_posterior = function(y, s, w) {
- k = length(y)
-
- log_density = function(b) {
- -k * lgamma(b / 2) -
- (1 / (2 * b)) +
- ((k * b - 3) / 2) * log(b / 2) +
- b / 2 * sum((log(s * w)) - w * s)
- }
-
- density = function(b) { exp(log_density(b)) }
-
- sampler = function() {
-
- # CHECK do i really need to transform this? is it log-concave already?
- density_xformed = function(b) { b * density(b) }
- log_density_xformed = function(b) { log(density_xformed(b)) }
-
- grad_log_density_xformed = function(b) {
- 1 / b - k * digamma(b / 2) +
- 1 / (2 * b ^ 2) +
- k / 2 * log(b / 2) + (k * b - 3) / b +
- sum((log(s * w)) - w * s) / 2
- }
-
- # ars can error out here due to its non-log-concavity detection
- val = ars(
- n = 1
- , f = log_density_xformed
- , fprima = grad_log_density_xformed
- , x = c(0.01, 0.5, 1)
- , lb = T
- , xlb = 0.01
- , ub = T
- , xub = 10)
-
- exp(val)
- }
-
- list(density, sampler)
- }
-
-db_posterior = function(b, y, s, w) { b_posterior(y, s, w)[[1]](b) }
-rb_posterior = function(y, s, w) { b_posterior(y, s, w)[[2]]() }
-
-# p (mixing probabilities), n (occupation numbers) and parameter models
-
-dp_prior = function(p, a) ddirichlet(p, a)
-rp_prior = function(a) drop(rdirichlet(1, a))
-
-dn_prior = function(n, p) dmultinom(n, prob = p)
-rn_prior = function(p) drop(rmultinom(1, size = 1, prob = p))
-
-da_prior = function(a) dinvgamma(a, 1, 1)
-ra_prior = function() rinvgamma(1, 1, 1)
-
-a_posterior = function(k, n) {
- log_density = function(a) {
- (k - 3 / 2) * log(a) - 1 / (2 * a) + lgamma(a) - lgamma(n + a)
- }
-
- density = function(a) { exp(log_density(a)) }
-
- # the density of log(a) is log-concave and can thus be sampled via ARS
- #
- # note that f(log(a)) = a * f(a), or log(f(log(a))) = log a + log f(a)
- #
- # ARS flakes out when n / k is sufficiently small (i.e. when 'a' would be
- # extremely large). probably rare for this to happen in practice, so 'a' is
- # capped at 20 here
- sampler = function() {
- log_xformed = expression(
- (k - 1 / 2) * log(a) - 1 / (2 * a) + lgamma(a) - lgamma(n + a))
- dens_log_xformed = function(a) eval(log_xformed)
-
- grad_log_xformed = D(log_xformed, 'a')
- grad_dens_log_xformed = function(a) eval(grad_log_xformed)
-
- val = ars(
- n = 1
- , f = dens_log_xformed
- , fprima = grad_dens_log_xformed
- , x = c(0.51, 1, 1.49)
- , lb = T
- , xlb = 0.001
- , ub = T
- , xub = 20.0)
-
- exp(val)
- }
- list(density, sampler)
- }
-
-da_posterior = function(a, k, n) { a_posterior(k, n)[[1]](a) }
-ra_posterior = function(k, n) { a_posterior(k, n)[[2]]() }
-
-# model
-
-model_prior = function(y) {
- k = length(y)
- l = rl_obj_prior(y)
- r = rr_obj_prior(y)
- mu = rmu_prior(k, l, r)
- b = rb_prior()
- w = rw_obj_prior(y)
- s = rs_prior(k, b, w)
- a = ra_prior()
- p = rp_prior(rep(a, k))
-
- sampler = ry_prior(mu, s, p)
- conditional_sampler = function(n) { replicate(n, ry_prior(mu, s, p)) }
- list(sampler, conditional_sampler)
- }
-
-rmodel_prior = function(y) { model_prior(y)[[1]] }
-
-# conditional model prior; sample a cluster configuration and then sample many
-# observations from that configuration
-
-rmodel_conditional_prior = function(n, y) { model_prior(y)[[2]](n) }
-
-# debug
-
-# y = test_data
-# k = length(y)
-# l = rl_obj_prior(y)
-# r = rr_obj_prior(y)
-# mu = rmu_prior(k, l, r)
-# w = rw_obj_prior(y)
-# b = rb_prior()
-# s = rs_prior(k, b, w)
-
-
diff --git a/dirichlet-process-mixture/src/spiral.r b/dirichlet-process-mixture/src/spiral.r
@@ -1,18 +0,0 @@
-require(MASS)
-require(scatterplot3d)
-
-set.seed(42)
-
-n = 800
-t = sort(runif(n) * 4 * pi)
-
-x = (13 - 0.5 * t) * cos(t)
-y = (13 - 0.5 * t) * sin(t)
-Z = mvrnorm(n, mu = rep(0, 3), Sigma = 0.5 * diag(3))
-
-X = matrix(data = c(x, y, t), nrow = n, ncol = 3) + Z
-
-# visualization
-
-# quartz()
-# scatterplot3d(X, highlight.3d = T, pch = 19)
diff --git a/finite-gaussian-mixture/src/fmm.r b/finite-gaussian-mixture/src/fmm.r
@@ -1,150 +0,0 @@
-set.seed(42)
-
-require(dplyr)
-require(gtools)
-require(magrittr)
-
-mixing_model = function(k, a) drop(rdirichlet(1, (rep(a, k))))
-label_model = function(n, p) drop(rmultinom(1, size = n, prob = p))
-location_model = function(k, l, r) rnorm(k, l, 1 / r)
-precision_model = function(k, b, w) rgamma(k, b, 1 / w)
-
-parameter_model = function(k, n) {
- p = mixing_model(k, 1)
- c = label_model(n, p)
- mu = location_model(k, 0, 0.1)
- s = precision_model(k, 1, 1)
- data.frame(c = c, mu = mu, s = s)
- }
-
-data_model = function(config) {
- sampler = function(row) rnorm(row[1], row[2], 1 / row[3])
- apply(config, MARGIN = 1, sampler)
- }
-
-model = function(k, n) parameter_model(k, n) %>% data_model
-
-# conditional models
-
-conditional_mixing_model = function(n, a0) {
- k = length(n)
- a1 = sapply(n, function(x) { x + a0 / k })
- drop(rdirichlet(1, a1))
- }
-
-# FIXME correctness dubious; easy to get NaN probabilities
-conditional_label_model = function(y, p0, mu, s) {
- k = length(p0)
- reducer = function(p, m, prec) { p * dnorm(y, m, 1 / prec) }
- elements = mapply(reducer, p0, mu, s)
- p1 = elements / sum(elements)
- sample(1:k, size = 1, prob = p1)
- }
-
-# FIXME correctness dubious; seems incredibly precise no matter what s is
-conditional_location_model = function(y, s, l, r) {
- k = length(s)
- n = sapply(y, length)
- yt = sapply(y, sum)
- m = (yt * s + l * r) / (n * s + r)
- v = 1 / (n * s + r)
- mapply(rnorm, 1, m, v)
- }
-
-# FIXME correctness dubious; precisions are very large
-conditional_precision_model = function(y, mu, b, w) {
- k = length(y)
- c = sapply(y, length)
-
- centered = mapply("-", y, mu)
- squared = lapply(centered, function(x) { x ^ 2 })
- ss = unlist(lapply(squared, sum))
-
- a = b + c
- bet = a / (w * b + ss)
-
- reducer = function(a, b) { rgamma(1, a, b) }
- mapply(reducer, a, bet)
- }
-
-inverse_model = function(n, y) {
- k = 3
- a = 1
- l = 0
- r = 0.1
- b = 1
- w = 1
-
- raw = unlist(y)
-
- kernel = function(p0, mu0, s0) {
- z = sapply(raw, function(x) conditional_label_model(x, p0, mu0, s0)) # FIXME slow
- labelled = data.frame(label = z, y = raw)
- id_query = function(c) filter(labelled, label == c)$y
-
- clustered = sapply(1:k, id_query)
- counts = sapply(clustered, length)
-
- p1 = conditional_mixing_model(counts, a)
- mu1 = conditional_location_model(clustered, s0, l, r)
- s1 = conditional_precision_model(clustered, mu1, b, w)
- list(p = p1, mu = mu1, s = s1)
- }
-
- gibbs = function(epochs, acc, p0, mu0, s0) {
- if (epochs <= 0) {
- return(acc)
- } else {
- result = kernel(p0, mu0, s0)
- # FIXME mapply(rbind, acc, result) ?
- acc$p = rbind(acc$p, result$p)
- acc$mu = rbind(acc$mu, result$mu)
- acc$s = rbind(acc$s, result$s)
- gibbs(epochs - 1, acc, result$p, result$mu, result$s)
- }
- }
-
- p0 = mixing_model(k, a)
- mu0 = location_model(k, l, r)
- s0 = precision_model(k, b, w)
- init = list(p = p0, mu = mu0, s = s0)
-
- gibbs(n, init, p0, mu0, s0)
- }
-
-# utilities
-
-safe_mean = function(x) {
- if (is.null(x) || (length(x) == 0)) {
- return(0)
- } else {
- mean(x)
- }
- }
-
-# debug
-
-test_data = list(
- rnorm(101, 10.5, 1)
- , rnorm(38, 0.3, 1)
- , rnorm(90, -8.2, 0.5)
- )
-
-# y = list(
-# rnorm(801, 3.5, 1)
-# , rnorm(300, 0.3, 0.8)
-# , rnorm(722, -4.2, 0.5)
-# )
-# k = length(y)
-# l = 0
-# r = 0.1
-# mu = location_model(k, l, r)
-# w = 1
-# b = 1
-# s = precision_model(k, b, w)
-# p = mixing_model(k, 1)
-# c = drop(rmultinom(1, 1, prob = p))
-# p0 = p
-# mu0 = mu
-# s0 = s
-
diff --git a/src/clean_gdp_data.r b/src/clean_gdp_data.r
@@ -1,26 +0,0 @@
-#!/usr/bin/Rscript
-
-HOME = Sys.getenv('HOME')
-
-wb_gdp_data_file = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv'
-
-data_dir = paste(HOME, 'projects/bnp/data', sep = '/')
-working_data_dir = paste(data_dir, 'working', sep = '/')
-input_data_dir = paste(data_dir, 'input', sep = '/')
-wb_gdp_data = paste(working_data_dir, wb_gdp_data_file, sep = '/')
-
-prune = function(data) {
- pruned = data[,c('Country.Name', 'X2014')]
- names(pruned) = c('country', 'rate')
- completes_only = pruned[complete.cases(pruned),]
- return(completes_only)
- }
-
-d = read.csv(wb_gdp_data, header = T, skip = 4)
-
-write.csv(
- prune(d)
- , paste(input_data_dir, 'gdp.csv', sep = '/')
- , row.names = F
- )
-
diff --git a/src/explore.r b/src/explore.r
@@ -1,12 +0,0 @@
-#!/usr/bin/Rscript
-
-data_dir = paste(Sys.getenv('HOME'), 'projects/bnp/data', sep = '/')
-input_data_dir = paste(data_dir, 'input', sep = '/')
-
-gdp_data = paste(input_data_dir, 'gdp.csv', sep = '/')
-
-d = read.csv(gdp_data, header = T, colClasses = c('factor', 'numeric'))
-
-require(ggplot2)
-
-g = ggplot(d, aes(rate))