The flow of the data looks like this:
1. Load an individual station id (newdat), which lists all dates, and its 5 closest stations (s1-s5).
2. For an individual station, check the value variable for each day for the conditions described above (e.g. outlier, NA, or -9999).
3. When a condition is met, grab the element, year, month, and day of that row in the newdat data frame.
4. Subset s1-s5 based on those criteria (this will leave 1 or 0 obs for each of s1-s5).
5. Stack s1-s5 and use idw if greater than 2 obs exist, use the single station if only 1 obs exists, or use the mean of the month and year if no obs exist.
6. If using idw, it is necessary to use the lat/long for station and for s1-s5.
7. Use this value to plug in to newdat where the value was found to be an outlier, NA, or -9999.
8. Move on to the next day.
# Observed station: ten daily TMAX/TMIN readings from 1900. The final row
# (1900-01-02, TMAX) carries a missing value.
newdat <- structure(
  list(
    id = rep("USC00031632", 10),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9), "02"),
    date = as.Date(c(
      "1900-01-01", "1900-01-01", "1900-02-01", "1900-02-01", "1900-03-01",
      "1900-04-01", "1900-04-01", "1900-05-01", "1900-05-01", "1900-01-02"
    )),
    value = c(30.02, 28, 37.94, 10.94, 28.042, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

# Neighbouring stations s1-s5: one TMAX row each for 1900-01-02, stored with
# tibble classes. Columns y/x are presumably latitude/longitude (confirm).
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# s5 holds the -9999 value for this day.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Station record with id, lat and long columns.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  row.names = c(NA, -1L),
  class = "data.frame"
)
# Observed station: ten daily TMAX/TMIN readings from 1900. The final row
# (1900-01-02, TMAX) carries a missing value.
newdat <- structure(
  list(
    id = rep("USC00031632", 10),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9), "02"),
    date = as.Date(c(
      "1900-01-01", "1900-01-01", "1900-02-01", "1900-02-01", "1900-03-01",
      "1900-04-01", "1900-04-01", "1900-05-01", "1900-05-01", "1900-01-02"
    )),
    value = c(30.02, 28, 37.94, 10.94, 28.04, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

# Neighbouring stations s1-s5: one TMAX row each for 1900-01-02, stored with
# tibble classes. Columns y/x are presumably latitude/longitude (confirm).
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# s5 holds the -9999 value for this day.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Station record with id, lat and long columns.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  row.names = c(NA, -1L),
  class = "data.frame"
)
The flow of the data looks like this:
1. Load an individual station id (newdat), which lists all dates, and its 5 closest stations (s1-s5).
2. For an individual station, check the value variable for each day for the conditions described above (e.g. outlier, NA, or -9999).
3. When a condition is met, grab the element, year, month, and day of that row in the newdat data frame.
4. Subset s1-s5 based on those criteria (this will leave 1 or 0 obs for each of s1-s5).
5. Stack s1-s5 and use idw if greater than 2 obs exist, use the single station if only 1 obs exists, or use the mean of the month and year if no obs exist.
6. If using idw, it is necessary to use the lat/long for station and for s1-s5.
7. Use this value to plug in to newdat where the value was found to be an outlier, NA, or -9999.
8. Move on to the next day.
# Observed station: ten daily TMAX/TMIN readings from 1900. Row 5 holds a
# value of 200 and the final row (1900-01-02, TMAX) carries a missing value.
newdat <- structure(
  list(
    id = rep("USC00031632", 10),
    element = factor(
      c("TMAX", "TMIN", "TMAX", "TMIN", "TMIN",
        "TMAX", "TMIN", "TMAX", "TMIN", "TMAX"),
      levels = c("TMAX", "TMIN")
    ),
    year = rep("1900", 10),
    month = c("01", "01", "02", "02", "03", "04", "04", "05", "05", "01"),
    day = c(rep("01", 9), "02"),
    date = as.Date(c(
      "1900-01-01", "1900-01-01", "1900-02-01", "1900-02-01", "1900-03-01",
      "1900-04-01", "1900-04-01", "1900-05-01", "1900-05-01", "1900-01-02"
    )),
    value = c(30.02, 28, 37.94, 10.94, 200, 25, 41, 82.04, 51.08, NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

# Neighbouring stations s1-s5: one TMAX row each for 1900-01-02, stored with
# tibble classes. Columns y/x are presumably latitude/longitude (confirm).
s1 <- structure(
  list(
    id = "USC00031152", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 37.04, y = 33.59, x = -92.8236
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s2 <- structure(
  list(
    id = "USC00034638", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 62.06, y = 34.7392, x = -90.7664
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s3 <- structure(
  list(
    id = "USC00036352", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.92, y = 35.2833, x = -93.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

s4 <- structure(
  list(
    id = "USC00036248", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = 30.02, y = 36.3667, x = -94.1
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# s5 holds the -9999 value for this day.
s5 <- structure(
  list(
    id = "USC00035112", element = "TMAX",
    year = 1900, month = 1, day = 2,
    date = as.Date("1900-01-02"),
    value = -9999, y = 33.9294, x = -93.8583
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Station record with id, lat and long columns.
station <- structure(
  list(id = "USC00030528", lat = 35.45, long = -92.4),
  row.names = c(NA, -1L),
  class = "data.frame"
)
The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value in the newdat data set is greater than 134, less than -80, equal to -9999, or NA, then run the function to get an interpolated value. If greater than 2 stations exist use idw, if only 1 station exists use the closest station, or if no close stations exist use the mean for the month of the year.
The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value in the newdat data set is greater than 134, less than -80, equal to -9999, or NA, then run the function to get an interpolated value. If greater than 2 stations exist use idw, use the closest station if only 1 station exists, or use the mean for the month of the year if no close stations exist.
The main idea is to build some fine scale weather data using inverse distance weighting (idw) from the 5 closest stations of the observed station. If the value
in the newdat
data set is greater than 134, less than -80, equal to -9999, or NA
then run the function to get an interpolated value. If greater than 2 stations exist use idw, if only 1 station exists use the closest station, or if no close stations exist use the mean for the month of the year.