commit b7d1454a2be517ecbfd79b46dd580c4359824796
parent 1f092dc778b51514b0337b654cf569382e5e73bb
Author: Jared Tobin <jared@jtobin.ca>
Date: Mon, 14 Dec 2015 01:27:01 -0500
Misc.
Diffstat:
4 files changed, 57 insertions(+), 25 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,17 +1,27 @@
PROJECT_DIR = $(HOME)/projects/bnp
-DATA_DIR = $(PROJECT_DIR)/data/
-INPUT_DATA_DIR = $(PROJECT_DIR)/data/input
-RAW_DATA_DIR = $(DATA_DIR)/raw
+
+# Data directories, all derived from DATA_DIR so the layout is defined once.
+DATA_DIR = $(PROJECT_DIR)/data
+RAW_DATA_DIR = $(DATA_DIR)/raw
+WORKING_DATA_DIR = $(DATA_DIR)/working
+INPUT_DATA_DIR = $(DATA_DIR)/input
+
+SRC_DIR = $(PROJECT_DIR)/src
+CLEAN_DATA_SCRIPT = $(SRC_DIR)/clean_gdp_data.r
# http://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG
RAW_GDP_DATA_URL = 'http://api.worldbank.org/v2/en/indicator/ny.gdp.mktp.kd.zg?downloadformat=csv'
+# Base name (no extension) shared by the raw .zip and the extracted .csv.
+# Left unquoted on purpose: make keeps quote characters as part of a target
+# name, while the shell running the recipe strips them, so a quoted value
+# names targets that never exist on disk and are rebuilt on every run.
+RAW_GDP_DATA_BASE = ny.gdp.mktp.kd.zg_Indicator_en_csv_v2
-# get raw data
-$(RAW_DATA_DIR)/ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.zip:
+# Download the archive at RAW_GDP_DATA_URL and save curl's output as the target.
+$(RAW_DATA_DIR)/$(RAW_GDP_DATA_BASE).zip:
curl $(RAW_GDP_DATA_URL) > $@
-# clean raw data (FIXME do more here)
-ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv: \
- $(RAW_GDP_DATA)
- unzip $< $@ && mv $@ $(INPUT_DATA_DIR)
+# Extract the archive member named like the target from the matching raw zip.
+# unzip restores the member's archived modification time, which can be older
+# than the downloaded zip; touching the target stops make from re-extracting
+# on every run, and -o avoids unzip's interactive overwrite prompt.
+$(WORKING_DATA_DIR)/%.csv: \
+ $(RAW_DATA_DIR)/%.zip
+	unzip -o $< $(notdir $@) -d $(WORKING_DATA_DIR) && touch $@
+
+# Clean the extracted World Bank CSV. The target is the file the cleaning
+# script actually writes (gdp.csv in the input directory), so it exists once
+# the recipe has run. The script is also a prerequisite: editing it
+# regenerates the cleaned data.
+$(INPUT_DATA_DIR)/gdp.csv: \
+ $(WORKING_DATA_DIR)/$(RAW_GDP_DATA_BASE).csv \
+ $(CLEAN_DATA_SCRIPT)
+	$(CLEAN_DATA_SCRIPT)
+
+# `all` names no file, so it is declared phony. It is also made the default
+# goal so a bare `make` builds the cleaned data rather than only the first
+# rule in the file.
+.PHONY: all
+.DEFAULT_GOAL := all
+all: $(INPUT_DATA_DIR)/gdp.csv
diff --git a/src/clean_gdp_data.r b/src/clean_gdp_data.r
@@ -0,0 +1,26 @@
+#!/usr/bin/Rscript
+
+# Build the project's data paths under the current user's home directory.
+HOME <- Sys.getenv('HOME')
+
+# File name of the World Bank CSV read from the working directory.
+wb_gdp_data_file <- 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv'
+
+data_dir <- paste0(HOME, '/', 'projects/bnp/data')
+working_data_dir <- paste0(data_dir, '/', 'working')
+input_data_dir <- paste0(data_dir, '/', 'input')
+wb_gdp_data <- paste0(working_data_dir, '/', wb_gdp_data_file)
+
+# Keep only the 'Country.Name' and 'X2014' columns of `data`, rename them to
+# 'country' and 'rate', and drop every row that has a missing value.
+prune <- function(data) {
+  wanted <- c(country = 'Country.Name', rate = 'X2014')
+  subset_df <- data[, unname(wanted)]
+  names(subset_df) <- names(wanted)
+  subset_df[complete.cases(subset_df), ]
+}
+
+# Read the extracted World Bank CSV, skipping its first four lines so the
+# following line is parsed as the header row.
+# TRUE/FALSE are spelled out: T and F are ordinary variables that can be rebound.
+d = read.csv(wb_gdp_data, header = TRUE, skip = 4)
+
+# Write the pruned table to gdp.csv in the input directory, without row names.
+write.csv(
+ prune(d)
+ , paste(input_data_dir, 'gdp.csv', sep = '/')
+ , row.names = FALSE
+ )
+
diff --git a/src/explore.r b/src/explore.r
@@ -0,0 +1,12 @@
+#!/usr/bin/Rscript
+
+# HOME is read from the environment here; nothing else in this script defines it.
+HOME = Sys.getenv('HOME')
+data_dir = paste(HOME, 'projects/bnp/data', sep = '/')
+input_data_dir = paste(data_dir, 'input', sep = '/')
+
+# Cleaned GDP table (columns: country, rate) as written by clean_gdp_data.r.
+gdp_data = paste(input_data_dir, 'gdp.csv', sep = '/')
+
+d = read.csv(gdp_data, header = TRUE, colClasses = c('factor', 'numeric'))
+
+# library() stops with an error when ggplot2 is missing; require() only warns.
+library(ggplot2)
+
+# ggplot() is given the data frame itself so aes(rate) can resolve the column;
+# d[,2] would pass a bare numeric vector instead.
+g = ggplot(d, aes(rate))
diff --git a/src/gdp.r b/src/gdp.r
@@ -1,16 +0,0 @@
-project_dir = '/Users/jtobin/projects/bnp/'
-data_dir = paste(project_dir, 'data/input', sep = '/')
-
-wb_gdp_data = paste(
- data_dir
- , 'ny.gdp.mktp.kd.zg_Indicator_en_csv_v2.csv'
- , sep = '/'
- )
-
-d = read.csv(
- wb_gdp_data
- , header = T
- , skip = 4
- )
-
-