Return to Question

Notice removed Draw attention by Community Bot

occurred Nov 25, 2015 at 4:11

Bounty Ended with no winning answer by Community Bot

occurred Nov 25, 2015 at 4:11

Updated flow of data as requested

Source Link

edited Nov 17, 2015 at 21:52

Amstell

edited Nov 17, 2015 at 21:52

Amstell

The flow of the data looks like this:

Load the data for an individual station id (newdat, which lists all dates) and its 5 closest stations (s1-s5).
For an individual station check the value variable for each day for conditions described above (e.g. outlier, NA, or -9999)
When condition is met, grab the location in the data frame of element, year, month, and day from newdat.
Subset s1-s5 based on those criteria (this will leave 1 or 0 obs for each of s1-s5).
Stack s1-s5, then use idw if greater than 2 obs exist, the single station's value if only 1 obs exists, or the mean of the month and year if no obs exist.
If using idw, it is necessary to use the lat/long for station and (s1-s5).
Use this value to plug in to newdat where the value was found to be an outlier, NA, or -9999.
Move on to next day

# Example observations for station USC00031632: ten rows, one per
# element/date combination. year, month and day are zero-padded character
# strings, `date` holds the same days as a Date (days since 1970-01-01),
# and the final `value` is NA.
newdat <- structure(
  list(
    id = rep("USC00031632", 10L),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10L),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9L), "02"),
    date = structure(
      c(-25567, -25567, -25536, -25536, -25508,
        -25477, -25477, -25447, -25447, -25566),
      class = "Date"
    ),
    # The fifth value is 200. The diff rendering had fused the removed
    # "28.04" with the inserted "200" into "28.04200" (= 28.042), which is
    # neither revision's value. 200 lies above the 134 upper bound named in
    # the question text, presumably so the sample contains an outlier.
    value = c(30.02, 28, 37.94, 10.94, 200, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
# Readings from the five neighbouring stations (s1-s5), all for TMAX on
# 1900-01-02. Each object is a one-row data frame carrying the tibble
# classes, with the neighbour's id, its value, and coordinates y / x.
# NOTE(review): y and x look like latitude and longitude - confirm.
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# s5 carries the value -9999, one of the conditions the question text
# lists as needing replacement.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# One-row plain data frame with the id and lat / long of a station.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  class = "data.frame",
  row.names = c(NA, -1L)
)

# Example observations for station USC00031632: ten rows, one per
# element/date combination. year, month and day are zero-padded character
# strings, `date` holds the same days as a Date (days since 1970-01-01),
# and the final `value` is NA.
newdat <- structure(
  list(
    id = rep("USC00031632", 10L),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10L),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9L), "02"),
    date = structure(
      c(-25567, -25567, -25536, -25536, -25508,
        -25477, -25477, -25447, -25447, -25566),
      class = "Date"
    ),
    value = c(30.02, 28, 37.94, 10.94, 28.04, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
# Readings from the five neighbouring stations (s1-s5), all for TMAX on
# 1900-01-02. Each object is a one-row data frame carrying the tibble
# classes, with the neighbour's id, its value, and coordinates y / x.
# NOTE(review): y and x look like latitude and longitude - confirm.
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# s5 carries the value -9999, one of the conditions the question text
# lists as needing replacement.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# One-row plain data frame with the id and lat / long of a station.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  class = "data.frame",
  row.names = c(NA, -1L)
)

The flow of the data looks like this:

Load the data for an individual station id (newdat, which lists all dates) and its 5 closest stations (s1-s5).
For an individual station check the value variable for each day for conditions described above (e.g. outlier, NA, or -9999)
When condition is met, grab the location in the data frame of element, year, month, and day from newdat.
Subset s1-s5 based on those criteria (this will leave 1 or 0 obs for each of s1-s5).
Stack s1-s5, then use idw if greater than 2 obs exist, the single station's value if only 1 obs exists, or the mean of the month and year if no obs exist.
If using idw, it is necessary to use the lat/long for station and (s1-s5).
Use this value to plug in to newdat where the value was found to be an outlier, NA, or -9999.
Move on to next day

# Example observations for station USC00031632: ten rows, one per
# element/date combination. year, month and day are zero-padded character
# strings, `date` holds the same days as a Date (days since 1970-01-01),
# the fifth `value` is 200 and the final `value` is NA.
newdat <- structure(
  list(
    id = rep("USC00031632", 10L),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10L),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9L), "02"),
    date = structure(
      c(-25567, -25567, -25536, -25536, -25508,
        -25477, -25477, -25447, -25447, -25566),
      class = "Date"
    ),
    value = c(30.02, 28, 37.94, 10.94, 200, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
# Readings from the five neighbouring stations (s1-s5), all for TMAX on
# 1900-01-02. Each object is a one-row data frame carrying the tibble
# classes, with the neighbour's id, its value, and coordinates y / x.
# NOTE(review): y and x look like latitude and longitude - confirm.
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# s5 carries the value -9999, one of the conditions the question text
# lists as needing replacement.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = structure(-25566, class = "Date"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -1L)
)
# One-row plain data frame with the id and lat / long of a station.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  class = "data.frame",
  row.names = c(NA, -1L)
)

added 15 characters in body

Source Link

edited Nov 17, 2015 at 5:46

Amstell

edited Nov 17, 2015 at 5:46

Amstell

Problem

deleted 4 characters in body

Source Link

edited Nov 17, 2015 at 4:55

Amstell

edited Nov 17, 2015 at 4:55

Amstell

The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value in the newdat data set is greater than 134, less than -80, equal to -9999, or NA then run the function to get an interpolated value. If greater than 2 stations exist use idw, if only 1 station exists use the closest station, or if no close stations exist use the mean for the month of the year.

The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value in the newdat data set is greater than 134, less than -80, equal to -9999, or NA then run the function to get an interpolated value. If greater than 2 stations exist using idw, use the closest station if only 1 station exists, or use the mean for the month of the year if no close stations exist.

The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value in the newdat data set is greater than 134, less than -80, equal to -9999, or NA then run the function to get an interpolated value. If greater than 2 stations exist use idw, if only 1 station exists use the closest station, or if no close stations exist use the mean for the month of the year.