The merged school + geocode file:
View attachment 2042
View attachment 2042
'data.frame': 7345 obs. of 14 variables:
$ adminstrator: chr "ACTING SUPERINTENDENT - DR. RAYMOND COLUCCIELLO" "PRINCIPAL - MR. KEN LEIN" "PRINCIPAL - MS. VIBETTA SANDERS" "PRINCIPAL - MR. THOMAS GIGLIO" ...
$ school : chr "ALBANY CITY SD" "MONTESSORI MAGNET SCHOOL" "PINE HILLS ELEMENTARY SCHOOL" "DELAWARE COMMUNITY SCHOOL" ...
$ beds : chr "010100010000" "010100010014" "010100010016" "010100010018" ...
$ address : chr "ACADEMY PARK" "65 TREMONT ST" "41 N ALLEN ST" "43 BERTHA ST" ...
$ city : chr "ALBANY" "ALBANY" "ALBANY" "ALBANY" ...
$ state : chr "NY" "NY" "NY" "NY" ...
$ zip : chr "122071099" "122061798" "122031601" "122092102" ...
$ area : chr "518" "518" "518" "518" ...
$ phone : chr "4756010" "4756675" "4756725" "4756750" ...
$ record : Factor w/ 5 levels "1","2","3","4",..: 3 1 1 1 1 1 1 1 1 1 ...
$ grade : Factor w/ 9 levels "0","1","2","3",..: NA 2 2 2 2 2 2 2 2 2 ...
$ locations : chr "ACADEMY PARK+ALBANY+NY+122071099" "65 TREMONT ST+ALBANY+NY+122061798" "41 N ALLEN ST+ALBANY+NY+122031601" "43 BERTHA ST+ALBANY+NY+122092102" ...
$ lat : num 42.7 42.7 42.7 42.6 42.7 ...
$ lng : num -73.7 -73.8 -73.8 -73.8 -73.8 ...
# browseURL("http://www.r-chart.com/2010/07/maps-geocoding-and-r-user-conference.html")
# browseURL("https://github.com/hadley/ggplot2/wiki/Crime-in-Downtown-Houston,-Texas-:-Combining-ggplot2-and-Google-Maps")
# browseURL("http://www.nysed.gov/admin/SEDdir.txt")#The Dta File 1
# browseURL("http://www.nysed.gov/admin/LayOut.txt")#The Key
# browseURL("http://www.nysed.gov/admin/LayOut.txt")#Testing data files
# save(dat, file = "dat.RData")
# load("dat.RData")
##########################################################################################
# STEP 1: READ INT THE ADRESSES, BEDS, AND DEOMGRAPHICS OF ALL SCHOOLS IN NEW YORK STATE #
##########################################################################################
path <- "http://www.nysed.gov/admin/SEDdir.txt"
cnames <- c("adminstrator", "school", "beds", "address", "city", "state",
"zip", "area", "phone", "record", "grade")
classes <- c(rep("character", 9), rep("factor", 2))
dat <- read.csv(file = url(path), header = FALSE, strip.white = TRUE, sep= ",",
na.strings= c(" ", ""),stringsAsFactors = FALSE, col.names = cnames,
colClasses = classes)
dat$locations <- as.character(with(dat, paste(address, city, state, zip, sep="+")))
NAer <- function(x) which(is.na(x)) #use to locate missing data
lapply(dat, NAer) #use to locate missing data
dat <- dat[-283, ] #remove observation 283
###########################################################################################
# STEP 2: CONVERT THE ADDRESS TO LAT AND LONGITUDE. #
###########################################################################################
# Many geocoding websites limit you to 2500 requests per day. #
# We will utilize two geo coding websites to work through the #
# problem faster. The two sites we will be utilizing: #
# #
# Google API: #
# browseURL("http://code.google.com/apis/maps/articles/geocodestrat.html") #
# #
# University of Southern California's geocoding web services: #
# browseURL("https://webgis.usc.edu/Services/Geocode/WebService/GeocoderWebService.aspx") #
# NOTE: This site requires you sign up for an use an api.key that you must provide to #
# the csus_geocode function. #
###########################################################################################
# GOOGLE'S API GEOCODING FUNCTION #
###################################
google_geocode <- function(ADDRESS){
require(XML); require(RCurl)
requestXML <- function(address) {
URL <- paste("http://maps.google.com/maps/api/geocode/xml?sensor=false&address=", address, sep = "")
xmlRequest <- getURL(URL)
xml <- xmlToList(xmlRequest)
return(xml)
}
A2 <- gsub("+", ",+", ADDRESS, fixed = TRUE)
A3 <- gsub(" ", "+", A2, fixed = TRUE)
x <- requestXML(A3)
y <- c(lat = as.numeric(x$result$geometry$location$lat), lng = as.numeric(x$result$geometry$location$lng))
return(y)
} #end of function
###########################################################################################
# df <- dat[sample(1:nrow(dat), 20), ] #test it with 20 first
# df <- dat[, ] #Select specific rows (chunks for geocoding)
addresses <- df$locations
latitude <- vector("numeric", length(addresses))
longitude <- vector("numeric", length(addresses))
for (n in seq(addresses)) {
x <- google_geocode(addresses[n])
latitude[n] <- x[1]
longitude[n] <- x[2]
Sys.sleep(0.1) # Suspend requests for a tenth of a second
}
df <- cbind(df, lat = latitude, lng = longitude)
###########################################################################################
# UNIVERSITY OF SOUTHERN CALIFORNIA'S GEOCODING FUNCTION #
##########################################################
csus_geocode <- function(ADDRESS, api.key = "INSERT YOUR OWN API CODE HERE"){
require(XML); require(RCurl)
ADDRESS <- unlist(strsplit(ADDRESS, "+", fixed = TRUE))
geocode_url <- function(address, city, state, zip, api.key){ #function to make urls
address <- gsub(" ", "+", address, fixed = TRUE)
city <- gsub(" ", "+", city, fixed = TRUE)
zip <- substring(zip, 1, 5)
z <- "http://webgis.usc.edu/Services/Geocode/WebService/GeocoderWebServiceHttpNonParsed_V02_96.aspx?"
x <- paste(z, "streetAddress=", address, "&city=", city, "&state=", state, "&zip=",
zip, "&census=false&format=xml&version=2.96&apiKey=", api.key, sep="")
return(x)
} #end of url creation helper function
requestXML <- function(URL) { #address to coordinates function
URL <- URL
xmlRequest <- getURL(URL)
xmlRequest <- substr(xmlRequest, 4, nchar(xmlRequest)) #removes 4 garbage characters at the begining
xml <- xmlToList(xmlRequest)
return(xml)
}#end of address to coordinates function
URL <- geocode_url(address = ADDRESS[1], city = ADDRESS[2], state = ADDRESS[3], zip =ADDRESS[4],
api.key = api.key)
x <- requestXML(URL)
y <- c(lat = as.numeric(x$OutputGeocode$Latitude), lng = as.numeric(x$OutputGeocode$Longitude))
return(y)
} #end of function
#####################################################################################################
# df <- dat[sample(1:nrow(dat), 20), ] #test it with 20 first
# df <- dat[, ] #Select specific rows (chunks for geocoding)
addresses <- df$locations
latitude <- vector("numeric", length(addresses))
longitude <- vector("numeric", length(addresses))
for (n in seq(addresses)) {
x <- csus_geocode(addresses[n])
latitude[n] <- x[1]
longitude[n] <- x[2]
Sys.sleep(0.1) # Suspend requests for a tenth of a second
} # end of function
df <- cbind(df, lat = latitude, lng = longitude)
#####################################################################################################
# GEOCODING A WAITING GAME :) #
########################################################################
df <- dat[1:4500, ] #Select specific rows (chunks for geocoding)
addresses <- df$locations
latitude <- vector("numeric", length(addresses))
longitude <- vector("numeric", length(addresses))
START <- Sys.time()
for (n in seq(addresses)) {
x <- csus_geocode(addresses[n])
latitude[n] <- x[1]
longitude[n] <- x[2]
Sys.sleep(0.1) # Suspend requests for a tenth of a second
} # end of function
df1 <- cbind(df, lat = latitude, lng = longitude)
END <- Sys.time(); END - START #Timing purposes
save(df1, file="PART1.RData")
load("PART1.RData")
##############################################################################################
df <- dat[4501:nrow(dat), ] #Select specific rows (chunks for geocoding)
addresses <- df$locations
latitude <- vector("numeric", length(addresses))
longitude <- vector("numeric", length(addresses))
START <- Sys.time()
for (n in seq(addresses)) {
x <- csus_geocode(addresses[n])
latitude[n] <- x[1]
longitude[n] <- x[2]
Sys.sleep(0.1) # Suspend requests for a tenth of a second
} # end of function
df2 <- cbind(df, lat = latitude, lng = longitude)
END <- Sys.time(); END - START
save(df2, file="PART2.RData")
################################################################################################
# MERGEING THE TWO DATA SETS AND SAVING TO .RDATA #
###################################################
load("PART1.RData"); load("PART2.RData")
geodat <- as.data.frame(rbind(df1, df2))
qview(geodat)
save(geodat, file="geodat.RData")
################################################################################################
# STEP 3 GATHERING MORE DATA (ACADEMIC ACHIEVEMENT) #
#####################################################
load("geodat.RData")
mathURL <- "http://www.p12.nysed.gov/irs/ela-math/2011/2011Grades3-8MathDistrictandBuildingAggregates_Media.xls"
classes <- c(rep("character", 10))
cnames <- c("beds", "name", "subject", "changed", "n.math", "lv1m", "lv2m", "lvl3m", "lvl4m", "mean.m")
library(gdata)
math <- read.xls(xls= mathURL, sheet=1, header=FALSE, strip.white=T, sep=",",as.is=FALSE,
na.strings=c("999","NA"," "), stringsAsFactors = FALSE, , col.names = cnames,
colClasses = classes, skip = 1, method="csv")
elaURL <- "http://www.p12.nysed.gov/irs/ela-math/2011/2011Grades3-8ELADistrictandBuildingAggregates_Media.xls"
classes <- c(rep("character", 10))
cnames <- c("beds", "name", "subject", "changed", "n.ela", "lv1e", "lv2e", "lvl3e", "lvl4e", "mean.e")
ela <- read.xls(xls= elaURL, sheet=1, header=FALSE, strip.white=T, sep=",",as.is=FALSE,
na.strings=c("999","NA"," "), stringsAsFactors = FALSE, , col.names = cnames,
colClasses = classes, skip = 1, method="csv")
save(ela, math, file="ELA_MATH.RData")
######################################################
# STILL SOME CLEANING TO DO BUT THAT'S FOR TOMORROW. #
######################################################
################################################################################################
# STEP 3 GATHERING MORE DATA (ACADEMIC ACHIEVEMENT) #
#####################################################
#############################################
# IMPORT AND CLEAN THE MATH ASSESSMENT DATA #
#############################################
library(gdata)
mathURL <- "http://www.p12.nysed.gov/irs/ela-math/2011/2011Grades3-8MathDistrictandBuildingAggregates_Media.xls"
classes <- c(rep("character", 10))
cnames <- c("beds", "name", "subject", "changed", "n.math", "lv1m", "lv2m", "lvl3m", "lvl4m", "mean.m")
math <- read.xls(xls= mathURL, sheet=1, header=FALSE, strip.white=T, sep=",",as.is=FALSE,
na.strings=c("999","NA"," "), stringsAsFactors = FALSE, col.names = cnames,
colClasses = classes, skip = 1, method="csv")
math[ "subject"] <- as.factor(gsub("Grade ", "", gsub(" Math", "", math[, c("subject")])))
names(math)[3] <- "grade"
math[4] <- NULL #drop the changed column
math[, 4:9] <- lapply(math[, 4:9], as.numeric)
############################################
# IMPORT AND CLEAN THE ELA ASSESSMENT DATA #
############################################
elaURL <- "http://www.p12.nysed.gov/irs/ela-math/2011/2011Grades3-8ELADistrictandBuildingAggregates_Media.xls"
classes <- c(rep("character", 10))
cnames <- c("beds", "name", "subject", "changed", "n.ela", "lv1e", "lv2e", "lvl3e", "lvl4e", "mean.e")
ela <- read.xls(xls= elaURL, sheet=1, header=FALSE, strip.white=T, sep=",",as.is=FALSE,
na.strings=c("999","NA"," "), stringsAsFactors = FALSE, col.names = cnames,
colClasses = classes, skip = 1, method="csv")
ela[ "subject"] <- as.factor(gsub("Grade ", "", gsub(" ELA", "", ela[, c("subject")])))
names(ela)[3] <- "grade"
ela[2] <- NULL; ela[3] <- NULL #drop the school name and changed columns
ela[, 3:8] <- lapply(ela[, 3:8], as.numeric)
assess.dat <- merge(math, ela, by = c("beds", "grade"))
# save(assess.dat, ela, math, file="ELA_MATH.RData")
################################################################################################
# STEP 4 PLOT THE DATA ON A SIMPLE MAP USING: ggplot2, maps, and mapproj #
##########################################################################
unemployment = read.table("http://www.stanford.edu/class/stats202/data/unemployment.csv", header=TRUE, sep='|')
names(unemployment)
library(maps)
library(ggplot2)
county_df = map_data('county')
state_df = map_data('state')
county_df$state = county_df$region
county_df$county = county_df$subregion
unemployment_jul10 = unemployment[unemployment$period=='Jul-10',]
map_data = merge(county_df, unemployment_jul10, by=c('state', 'county'))
map_data = map_data[order(map_data$order),]
map_data$rate_d = cut(map_data$rate, breaks=c(seq(0,10,by=2),35))
(ggplot(map_data, aes(long, lat, group=group)) +
geom_polygon(aes(fill=rate_d), colour=alpha('white', 1/2), size=0.2) +
geom_polygon(data=state_df, colour='white', fill=NA) +
scale_fill_brewer(pal='PuRd'))
state_df$avg_unemployment_jul10 = numeric(nrow(state_df))
sorted_states = sort(unique(map_data$state))
for (state in sorted_states) {
state_subs = unemployment_jul10$state==state
state_df_subs = state_df
state_df$avg_unemployment_jul10[state_df$region == state] = mean(unemployment_jul10$rate[state_subs])
}
state_df$rate_d = cut(state_df$avg_unemployment_jul10, breaks=c(seq(0,10,by=2),35))
(ggplot(state_df, aes(long, lat, group=group)) +
geom_polygon(aes(fill=rate_d), colour=alpha('white', 1/2), size=0.2) +
scale_fill_brewer(pal='PuRd'))
# browsURL("https://gist.github.com/233134")
library(ggplot2)
library(maps)
# First (and most annoying) task - get matching state and county variables
# for both datasets. And unfortauntely it's not quite right, as you can
# see from the finish product - some counties are missing.
path <- "http://datasets.flowingdata.com/unemployment09.csv"
unemp <- read.csv(file=url(path), header = F, stringsAsFactors = F)
names(unemp) <- c("id", "state_fips", "county_fips", "name", "year",
"?", "?", "?", "rate")
unemp$county <- tolower(gsub(" County, [A-Z]{2}", "", unemp$name))
unemp$state <- gsub("^.*([A-Z]{2}).*$", "\\1", unemp$name)
county_df <- map_data("county")
names(county_df) <- c("long", "lat", "group", "order", "state_name", "county")
county_df$state <- state.abb[match(county_df$state_name, tolower(state.name))]
county_df$state_name <- NULL
state_df <- map_data("state")
# Combine together
choropleth <- merge(county_df, unemp, by = c("state", "county"))
choropleth <- choropleth[order(choropleth$order), ]
# Discretise rate to use with Brewer colour scheme - many options here
# choropleth$rate_d <- cut_number(choropleth$rate, 5)
# choropleth$rate_d <- cut_interval(choropleth$rate, 5)
# Nathan's choice is a little odd:
choropleth$rate_d <- cut(choropleth$rate, breaks = c(seq(0, 10, by = 2), 35))
# Once you have the data in the right format, recreating the plot is straight
# forward.
ggplot(choropleth, aes(long, lat, group = group)) +
geom_polygon(aes(fill = rate_d), colour = alpha("white", 1/2), size = 0.2) +
geom_polygon(data = state_df, colour = "white", fill = NA) +
scale_fill_brewer(pal = "PuRd")
# Takes a while to draw because ggplot2 not very efficient with large numbers
# of polygons :(
library(maps)
library(RColorBrewer)
# ========== Obtain 2010 Unemployment Statistics ==========
importBLS <- function(year, ...)
# Accesses county unemployment data from the BLS FTP site
#
# Arguments:
# year - '2-digit' character representation of the desired year. Valid for 1990 to 2010.
# ... - additional arguments passed to read.fwf
#
# Returns:
# A data.frame containing cleaned up BLS data
{
infile <- paste("ftp://ftp.bls.gov/pub/special.requests/la/laucnty", year, ".txt", sep = "")
df <- read.fwf(url(infile), skip = 6, strip.white = TRUE, ...,
col.names = c("series_id", "sFIPS", "cFIPS", "name", "year", "labor", "emp", "unemp", "unrate"),
colClasses = c("factor", rep("character", 3), "factor", rep("character", 3), "numeric"),
widths = c(8, 5, 8, 53, 4, 14, 13, 11, 9))
df <- transform(df,
FIPS = as.numeric(paste(sFIPS, cFIPS, sep = "")),
labor = as.numeric(gsub(",", "", labor)), # Remove formatted strings that include ","
emp = as.numeric(gsub(",", "", emp)),
unemp = as.numeric(gsub(",", "", unemp))
); # end transform
return(df)
} # end function
df <- importBLS("10") # Takes nearly 10 seconds to complete
# ========== Prepare Map Data ==========
data(county.fips)
df$bins <- as.numeric(cut(df$unrate, c(seq(0, 10, 2), 100))) # Classification
pal <- brewer.pal(6, "BuPu") # Color palette
leg.txt <- c("< 2", "2-4", "4-6", "6-8", "8-10", "> 10") # Legend text
matches <- df$bins[match(county.fips$fips, df$FIPS)] # Matched Classes
# ========== Map Data ==========
map("county", col = pal[matches], fill = TRUE, bg = "grey95", resolution = 0, lty = 1)
mtext("Unemployment Rate by County", line = 3, cex = 1.2)
mtext("United States, 2010", line = 2)
legend("bottomright", leg.txt, fill = pal, bty = 'n', cex = 0.8, inset = 0.03)
library(mapproj)
map("county", col = pal[matches], fill = TRUE,
bg = "grey95", projection = "polyconic")
mtext("Unemployment Rate by County", cex = 1.2)
mtext("United States, 2010", line = -1)
legend("bottom", leg.txt, fill = pal, bty = 'n', cex = 0.8, inset = 0.1, horiz = TRUE)
# Legend text and call should probably be separate from this function or passed as additional parameters
mapBLS <- function(data, classes, palette, var, merge) {
data$bins <- classification(data[, var], classes) # Classification
pal <- brewer.pal(classes, palette) # Color palette
matches <- data$bins[match(county.fips$fips, data[, merge])] # Matched Classes
...
} # end function
mapBLS(df, 6, "BuPu", "unrate", "FIPS")
# Prepares classifications for a vector; can be extended to provide other classifications
classification <- function(x, n, method = "quantile") {
cutoffs <- quantile(x, seq(0, 1, length = n), na.rm = TRUE)
cut(x, cutoffs)
} # end function
# ========== Obtain 2010 Unemployment Statistics ==========
importBLS <- function(year, ...)
# Accesses county unemployment data from the BLS FTP site
#
# Arguments:
# year - '2-digit' character representation of the desired year. Valid for 1990 to 2010.
# ... - additional arguments passed to read.fwf
#
# Returns:
# A data.frame containing cleaned up BLS data
{
infile <- paste("ftp://ftp.bls.gov/pub/special.requests/la/laucnty", year, ".txt", sep = "")
df <- read.fwf(url(infile), skip = 6, strip.white = TRUE, ...,
col.names = c("series_id", "sFIPS", "cFIPS", "name", "year", "labor", "emp", "unemp", "unrate"),
colClasses = c("factor", rep("character", 3), "factor", rep("character", 4)),
widths = c(8, 5, 8, 53, 4, 14, 13, 11, 9))
df <- transform(df,
FIPS = as.numeric(paste(sFIPS, cFIPS, sep = "")),
labor = as.numeric(gsub(",", "", labor)), # Remove formatted strings that include ","
emp = as.numeric(gsub(",", "", emp)),
unemp = as.numeric(gsub(",", "", unemp)),
unrate = as.numeric(unrate)
); # end transform
return(df)
} # end function
mapBLS <- function(df, classes, palette, var, link, title, subtitle) {
require(maps)
require(RColorBrewer)
require(mapproj)
# ========== Prepare Map Data ==========
data(county.fips)
df$bins <- classification(df[, var], classes) # Classification
pal <- brewer.pal(classes, palette) # Color palette
matches <- df$bins[match(county.fips$fips, df[, link])] # Matched Classes
# ========== Make Map ==========
map("county", col = pal[as.numeric(matches)], fill = TRUE,
bg = "grey95", projection = "polyconic")
if (!is.null(title)) mtext(title, cex = 1.2)
if (!is.null(subtitle)) mtext(subtitle, line = -1)
legend("bottom", levels(matches), fill = pal, bty = 'n', cex = 0.8, inset = 0.1, horiz = TRUE)
}
dl <- vector('list', 6)
dl[[1]] <- importBLS("05")
dl[[2]] <- importBLS("06")
dl[[3]] <- importBLS("07")
dl[[4]] <- importBLS("08")
dl[[5]] <- importBLS("09")
dl[[6]] <- importBLS("10")
png("UNRATE.png", 1920, 1200)
par(mfrow = c(2, 3))
lapply(seq(6), function(n) {
mapBLS(dl[[n]], 6, "BuPu", "unrate", "FIPS",
title = "Unemployment Rate by County",
subtitle = paste("United States,", switch(n, 2005, 2006, 2007, 2008, 2009, 2010))
) # end mapBLS
return(0)
}) # end lapply
dev.off()
> dl <- vector('list', 6)
> dl[[1]] <- importBLS("05")
Warning messages:
1: In eval(expr, envir, enclos) : NAs introduced by coercion
2: In eval(expr, envir, enclos) : NAs introduced by coercion
3: In eval(expr, envir, enclos) : NAs introduced by coercion
4: In eval(expr, envir, enclos) : NAs introduced by coercion
> dl[[2]] <- importBLS("06")
Warning messages:
1: In eval(expr, envir, enclos) : NAs introduced by coercion
2: In eval(expr, envir, enclos) : NAs introduced by coercion
3: In eval(expr, envir, enclos) : NAs introduced by coercion
4: In eval(expr, envir, enclos) : NAs introduced by coercion