COVID-19 California Statistics

# Wrangle data
covidDaily <- covid %>%
  filter(state == "California") %>%
  group_by(county, state) %>%
  mutate(dailyCases = cases - lag(cases)) %>%
  ungroup()

# Top cases
knitr::kable(
  covidDaily %>%
    filter(date == max(date)) %>%
    group_by(county) %>%
    summarize(mostCases = max(cases)) %>%
    arrange(-mostCases) %>%
    head(5), 
  col.names = c("County", "Confirmed Cases"),
  caption = paste("Top COVID-19 cases by county (",max(covidDaily$date),")")
)

# Daily change in cases
knitr::kable(
  covidDaily %>%
    filter(date == max(date)) %>%
    group_by(county) %>%
    summarize(dailyCases) %>%
    arrange(-dailyCases) %>%
    head(5),
  col.names = c("County", "Daily Change"),
  caption = paste("New COVID-19 cases by county (",max(covidDaily$date),")")
)

PopulationEstimates <- read_excel("data/PopulationEstimates.xls") # Read in dataset
names(PopulationEstimates) <- PopulationEstimates[2,] # Set headers
PopulationEstimates <- PopulationEstimates[-c(1:2),] # Remove first two rows

# Filter by California counties, excluding state population
PopEstCA <- PopulationEstimates %>% filter(State == "CA", Area_Name != "California")

# Get only counties and 2019 population estimate
PopEstCA2019 <- PopEstCA %>% select(Area_Name, POP_ESTIMATE_2019)

# Remove "County" from each Area_Name value
PopEstCA2019 <- PopEstCA2019 %>%
  mutate(county = str_remove_all(Area_Name, regex(".County")))

covidCAPop <- covidDaily %>%
  group_by(county) %>%
  left_join(PopEstCA2019, by = "county")

# Most cumulative cases per capita
knitr::kable(
  covidCAPop %>%
    filter(date == max(date)) %>%
    group_by(county) %>%
    mutate(casesPerCapita = cases/as.double(POP_ESTIMATE_2019)) %>%
    summarize(casesPerCapita = max(casesPerCapita)) %>%
    arrange(-casesPerCapita) %>%
    ungroup() %>%
    head(5),
  col.names = c("County", "Cases per Capita"),
  caption = paste("Most Cumulative COVID-19 Cases per Capita (",max(covidCAPop$date),")")
)

# most new cases per capita
knitr::kable(
  covidCAPop %>%
    filter(date == max(date)) %>%
    group_by(county) %>%
    mutate(casesPerCapita = dailyCases/as.double(POP_ESTIMATE_2019)) %>%
    summarize(casesPerCapita = max(casesPerCapita)) %>%
    arrange(-casesPerCapita) %>%
    ungroup() %>%
    head(5),
  col.names = c("County", "New Cases per Capita"),
  caption = paste("Most New COVID-19 Cases per Capita (",max(covidCAPop$date),")")
)

# Newest cases per 100,000 people for last 14 days
knitr::kable(
  covidCAPop %>%
    filter(date > max(date)-14, county != "Unknown") %>%
    group_by(date) %>%
    summarize(newCases = sum(cases, na.rm = TRUE)/100000) %>%
    ungroup() %>%
    arrange(-newCases),
  col.names = c("Date", "New Cases per 100,000 people"),
  caption = paste("Newest COVID-19 Cases per 100,000 people for the last 14 days")
)

options(scipen=999) # Do not use scientific notation

# Get total cases
covidDaily %>%
  filter(date == max(date)) %>%
  group_by(date) %>%
  summarize(cases = sum(cases, na.rm = TRUE)) %>%
  ungroup() %>%
  pull(cases) ->
  totalCases

# Get total new cases
covidDaily %>%
  filter(date == max(date)) %>%
  group_by(date) %>%
  summarize(dailyCases = sum(dailyCases, na.rm = TRUE)) %>%
  ungroup() %>%
  pull(dailyCases) ->
  newCases

# Get safe counties
covidDaily %>%
  filter(date > max(date)-14, cases <= 0) %>%
  arrange(dailyCases) %>%
  count() %>%
  pull(n) ->
  safeCounties
Top COVID-19 cases by county ( 2020-08-15 )
County Confirmed Cases
Los Angeles 220762
Riverside 46456
Orange 43367
San Bernardino 40442
San Diego 34407
New COVID-19 cases by county ( 2020-08-15 )
County Daily Change
Los Angeles 2069
San Bernardino 1047
Riverside 794
Fresno 743
Orange 513
Most Cumulative COVID-19 Cases per Capita ( 2020-08-15 )
County Cases per Capita
Imperial 0.0555031
Kings 0.0353668
Kern 0.0291779
Tulare 0.0259655
Merced 0.0244058
Most New COVID-19 Cases per Capita ( 2020-08-15 )
County New Cases per Capita
Inyo 0.0009978
Stanislaus 0.0007736
Fresno 0.0007437
Kings 0.0006473
San Bernardino 0.0004803
Newest COVID-19 Cases per 100,000 people for the last 14 days
Date New Cases per 100,000 people
2020-08-15 6.21981
2020-08-14 6.13243
2020-08-13 6.03212
2020-08-12 5.95097
2020-08-11 5.86078
2020-08-10 5.74267
2020-08-09 5.63244
2020-08-08 5.56158
2020-08-07 5.48142
2020-08-06 5.41013
2020-08-05 5.32776
2020-08-04 5.27258
2020-08-03 5.22235
2020-08-02 5.15937

As of 2020-08-15, there were 621981 confirmed COVID-19 cases and 8738 new confirmed COVID-19 cases.

Determined by having zero cases, there are 0 safe counties in California.

New Cases in NY, CA, LA, and FL

stateList = c("New York", "California", "Louisiana", "Florida")

covid %>%
  group_by(date,state) %>%
  summarize(stateCases = sum(cases, na.rm = TRUE)) %>%
  filter(state %in% stateList) %>%
  ungroup() %>%
  group_by(state) %>%
  mutate(dailyCases = stateCases - lag(stateCases), rollingMean = rollmean(dailyCases,7,fill=NA,align="right")) %>%
  ungroup() ->
  dailyAndRolling

dailyAndRolling %>%
  ggplot(aes(x=date, fill=state)) +
  geom_bar(aes(y=dailyCases),stat="identity") +
  geom_line(aes(y=rollingMean)) + 
  facet_wrap(~state) +
  labs(title = "New COVID-19 Cases in Selected States Over Time",
       x = "", 
       y = "Cases per day",
       subtitle = "Source: NYTimes") +
  theme_bw()

# Get selected states pop est
PopulationEstimates %>%
  filter(Area_Name %in% c('New York', 'California', 'Louisiana', 'Florida')) %>%
  mutate(state = Area_Name) %>%
  select(state, POP_ESTIMATE_2019) ->
  totalPopEst

# Join pop ests to covid data
dailyAndRolling %>%
  group_by(state) %>%
  left_join(totalPopEst, by = "state") %>%
  mutate(casesPerCapita = dailyCases/as.numeric(POP_ESTIMATE_2019), rollingMeanPerCapita = rollmean(casesPerCapita,7,fill=NA,align="right")) %>%
  ungroup() ->
  statePerCapitaDailyAndRolling

# Plot
statePerCapitaDailyAndRolling %>%
  ggplot(aes(x=date, fill=state)) +
  geom_bar(aes(y=casesPerCapita),stat="identity") +
  geom_line(aes(y=rollingMeanPerCapita)) + 
  scale_fill_brewer(palette="Dark2") +
  facet_wrap(~state) +
  labs(title = "New COVID-19 Cases per Capita in Selected States Over Time",
       x = "", 
       y = "Cases per capita per day",
       subtitle = "Source: NYTimes") +
  theme_bw()

The primary indicator when analyzing the difference between cases per capita and pure cases is the population estimate.

knitr::kable(
  totalPopEst %>%
    summarize(state, POP_ESTIMATE_2019),
  col.names = c("State", "Population Estimate"),
  caption = paste("Total Population Estimates per State (2019)")
)
Total Population Estimates per State (2019)
State Population Estimate
California 39512223
Florida 21477737
Louisiana 4648794
New York 19453561

Based on new confirmed cases per capita versus purely new cases, California’s situation seems more in control with its population in comparison to Louisiana, which has a significantly worse situation when relating its population to number of confirmed cases. Both Florida and New York seem to have proportionately the expected number of cases in relation to their population.

Weighted Mean Center across the USA

options(scipen=999) # Do not use scientific notation
readr::read_csv("data/county-centroids.csv") %>%
  select(fips, name, LON, LAT) ->
  countyCentroids

covid %>%
  filter(county != "Unknown") %>%
  mutate(fips = case_when(
    county == "New York City" ~ "36061",
    county == "Joplin" ~ "29097",
    county == "Kansas City" ~ "29047",
    !(county %in% c("New York City", "Joplin", "Kansas City")) ~ fips
  )) %>%
  full_join(countyCentroids, by = c("fips")) %>%
  group_by(date, county) %>%
  mutate(cumulativeCases = sum(cases, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(date) %>%
  mutate(totalCasesPerDay = sum(cases, na.rm = TRUE),
         x_coord = sum(LON * cases, na.rm = TRUE)/sum(cases, na.rm = TRUE),
         y_coord = sum(LAT * cases, na.rm = TRUE)/sum(cases, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(format(date,"%m")) %>%
  mutate(totalCasesPerMonth = sum(cases, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(!(is.na(date))) %>%
  ggplot(aes(x = LON, y = LAT)) +
  borders("state", fill = "gray90", colour = "white") +
  geom_point(aes(x = x_coord, y = y_coord, colour = format(date, "%m"), size = totalCasesPerDay), alpha = 0.5) +
  labs(title = "Weighted Mean Center for COVID-19 Cases across the United States",
       subtitle = "Source: NYTimes",
       x = "Longitude",
       y = "Latitude",
       size = "Confirmed Cases per Day",
       colour = "Month") +
  scale_color_hue(labels = c(month.name)) +
  theme_void()

The movement of the COVID-19 weighted mean center directly correlates to where breakouts of COVID-19 cases are occurring. From January, notice that the first points occur in the north west, particularly due to the initial outbreak in Seattle, Washington. As time passes, the weighted mean center moves toward the east coast, due to the outbreak in New York in April. As we approach today, the weighted mean center is beginning to shift toward the mid-west, as we can see the current outbreaks happening within California, Florida, and Texas.