bnp

Some older Bayesian nonparametrics research.
Log | Files | Refs | README | LICENSE

commit b7d1454a2be517ecbfd79b46dd580c4359824796
parent 1f092dc778b51514b0337b654cf569382e5e73bb
Author: Jared Tobin <jared@jtobin.ca>
Date:   Mon, 14 Dec 2015 01:27:01 -0500

Misc.

Diffstat:
M Makefile | 28 +++++++++++++++++++---------
A src/clean_gdp_data.r | 26 ++++++++++++++++++++++++++
A src/explore.r | 12 ++++++++++++
D src/gdp.r | 16 ----------------
4 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/Makefile b/Makefile @@ -1,17 +1,27 @@ PROJECT_DIR = $(HOME)/projects/bnp -DATA_DIR = $(PROJECT_DIR)/data/ -INPUT_DATA_DIR = $(PROJECT_DIR)/data/input -RAW_DATA_DIR = $(DATA_DIR)/raw + +DATA_DIR = $(PROJECT_DIR)/data +RAW_DATA_DIR = $(DATA_DIR)/raw +WORKING_DATA_DIR = $(PROJECT_DIR)/data/working +INPUT_DATA_DIR = $(PROJECT_DIR)/data/input + +SRC_DIR = $(PROJECT_DIR)/src +CLEAN_DATA_SCRIPT = $(SRC_DIR)/clean_gdp_data.r # http://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG RAW_GDP_DATA_URL = 'http://api.worldbank.org/v2/en/indicator/ny.gdp.mktp.kd.zg?downloadformat=csv' +RAW_GDP_DATA_BASE = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2' -# get raw data -$(RAW_DATA_DIR)/ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.zip: +$(RAW_DATA_DIR)/$(RAW_GDP_DATA_BASE).zip: curl $(RAW_GDP_DATA_URL) > $@ -# clean raw data (FIXME do more here) -ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv: \ - $(RAW_GDP_DATA) - unzip $< $@ && mv $@ $(INPUT_DATA_DIR) +$(WORKING_DATA_DIR)/%.csv: \ + $(RAW_DATA_DIR)/%.zip + unzip $< $(notdir $@) -d $(WORKING_DATA_DIR) + +$(INPUT_DATA_DIR)/%.csv: \ + $(WORKING_DATA_DIR)/%.csv + $(CLEAN_DATA_SCRIPT) + +all: $(INPUT_DATA_DIR)/$(RAW_GDP_DATA_BASE).csv diff --git a/src/clean_gdp_data.r b/src/clean_gdp_data.r @@ -0,0 +1,26 @@ +#!/usr/bin/Rscript + +HOME = Sys.getenv('HOME') + +wb_gdp_data_file = 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv' + +data_dir = paste(HOME, 'projects/bnp/data', sep = '/') +working_data_dir = paste(data_dir, 'working', sep = '/') +input_data_dir = paste(data_dir, 'input', sep = '/') +wb_gdp_data = paste(working_data_dir, wb_gdp_data_file, sep = '/') + +prune = function(data) { + pruned = data[,c('Country.Name', 'X2014')] + names(pruned) = c('country', 'rate') + completes_only = pruned[complete.cases(pruned),] + return(completes_only) + } + +d = read.csv(wb_gdp_data, header = T, skip = 4) + +write.csv( + prune(d) + , paste(input_data_dir, 'gdp.csv', sep = '/') + , row.names = F + ) + diff --git a/src/explore.r b/src/explore.r @@ -0,0 +1,12 
@@ +#!/usr/bin/Rscript + +data_dir = paste(HOME, 'projects/bnp/data', sep = '/') +input_data_dir = paste(data_dir, 'input', sep = '/') + +gdp_data = paste(input_data_dir, 'gdp.csv', sep = '/') + +d = read.csv(gdp_data, header = T, colClasses = c('factor', 'numeric')) + +require(ggplot2) + +g = ggplot(d[,2], aes(rate)) diff --git a/src/gdp.r b/src/gdp.r @@ -1,16 +0,0 @@ -project_dir = '/Users/jtobin/projects/bnp/' -data_dir = paste(project_dir, 'data/input', sep = '/') - -wb_gdp_data = paste( - data_dir - , 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv' - , sep = '/' - ) - -d = read.csv( - wb_gdp_data - , header = T - , skip = 4 - ) - -