Executive Summary

This report analyzes Puzzle Wednesday data: the time it takes Branden and James, working together as a team, to complete the Sudoku, NY Times Crossword, and Regular Commuter Crossword in the Wednesday edition of the Dallas Morning News.

The “Whose Paper” field indicates whose physical newspaper was used for that session (B = Branden brought his paper, J = James brought his paper), not who completed the puzzles.

Key Findings:

  • The team is getting faster! Completion times are improving by ~0.3 minutes per month
  • Day of completion matters: Finishing on Wednesday (paper day) is significantly faster than delaying
  • Wind affects performance: Higher wind speeds show a weak positive correlation with slower times (r = 0.16)
  • Best conditions (descriptive only; small samples): Leo zodiac, New Moon, Saturdays
  • Worst conditions (descriptive only; small samples): Aquarius zodiac, First Quarter moon, Mondays
  • Paper source doesn’t matter: No significant difference whether using Branden’s or James’s newspaper

Data Import & Transformation

Load Raw Data

puzzle_raw <- read_excel("data.xlsx") %>%
  rename(
    paper_date = `Paper Date`,
    complete_date = `Complete Date`,
    time = Time,
    whose_paper = `Whose Paper?`,
    notes = Notes
  ) %>%
  mutate(
    paper_date = as.Date(paper_date),
    complete_date = as.Date(complete_date)
  )

cat("Total records:", nrow(puzzle_raw), "\n")
## Total records: 157
cat("Date range:", as.character(min(puzzle_raw$paper_date)), "to", as.character(max(puzzle_raw$paper_date)))
## Date range: 2017-01-10 to 2025-12-31

Convert Time to Seconds/Minutes

puzzle_data <- puzzle_raw %>%
  mutate(
    time_hms = format(time, "%H:%M:%S"),
    hours = as.numeric(substr(time_hms, 1, 2)),
    minutes = as.numeric(substr(time_hms, 4, 5)),
    seconds_part = as.numeric(substr(time_hms, 7, 8)),
    time_seconds = hours * 3600 + minutes * 60 + seconds_part,
    time_minutes = time_seconds / 60
  ) %>%
  select(-hours, -minutes, -seconds_part)
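
An equivalent, more compact conversion is sketched below, assuming read_excel parsed the Time column as a POSIXct date-time; it uses lubridate accessors instead of string slicing.

# Alternative sketch: same conversion via lubridate accessors
# (assumes `time` was read in as a POSIXct date-time by read_excel)
puzzle_alt <- puzzle_raw %>%
  mutate(
    time_seconds = hour(time) * 3600 + minute(time) * 60 + second(time),
    time_minutes = time_seconds / 60
  )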

Add Zodiac Signs

get_zodiac_sign <- function(date) {
  month <- month(date)
  day <- day(date)

  if ((month == 3 && day >= 21) || (month == 4 && day <= 19)) return("Aries")
  if ((month == 4 && day >= 20) || (month == 5 && day <= 20)) return("Taurus")
  if ((month == 5 && day >= 21) || (month == 6 && day <= 20)) return("Gemini")
  if ((month == 6 && day >= 21) || (month == 7 && day <= 22)) return("Cancer")
  if ((month == 7 && day >= 23) || (month == 8 && day <= 22)) return("Leo")
  if ((month == 8 && day >= 23) || (month == 9 && day <= 22)) return("Virgo")
  if ((month == 9 && day >= 23) || (month == 10 && day <= 22)) return("Libra")
  if ((month == 10 && day >= 23) || (month == 11 && day <= 21)) return("Scorpio")
  if ((month == 11 && day >= 22) || (month == 12 && day <= 21)) return("Sagittarius")
  if ((month == 12 && day >= 22) || (month == 1 && day <= 19)) return("Capricorn")
  if ((month == 1 && day >= 20) || (month == 2 && day <= 18)) return("Aquarius")
  if ((month == 2 && day >= 19) || (month == 3 && day <= 20)) return("Pisces")

  return(NA_character_)
}

puzzle_data <- puzzle_data %>%
  mutate(zodiac_sign = sapply(complete_date, get_zodiac_sign))

Add Moon Phase Data

moon_data <- getMoonIllumination(date = puzzle_data$complete_date)

get_moon_phase_name <- function(phase) {
  # suncalc's phase value runs from 0 to 1 over one synodic month
  # (0 = new moon, 0.25 = first quarter, 0.5 = full moon, 0.75 = last quarter);
  # the four principal phases get narrow +/- 0.025 windows, and everything else
  # falls into the intermediate crescent/gibbous bins
  if (phase < 0.025 || phase >= 0.975) return("New Moon")
  if (phase < 0.25) return("Waxing Crescent")
  if (phase < 0.275) return("First Quarter")
  if (phase < 0.50) return("Waxing Gibbous")
  if (phase < 0.525) return("Full Moon")
  if (phase < 0.75) return("Waning Gibbous")
  if (phase < 0.775) return("Last Quarter")
  return("Waning Crescent")
}

puzzle_data <- puzzle_data %>%
  mutate(
    moon_illumination = moon_data$fraction,
    moon_phase_value = moon_data$phase,
    moon_phase_name = sapply(moon_data$phase, get_moon_phase_name)
  )

Add Weather Data (Flower Mound, TX)

lat <- 33.0146
lon <- -97.0969

unique_dates <- sort(unique(puzzle_data$complete_date))
date_range_start <- min(unique_dates)
date_range_end <- max(unique_dates)

weather_url <- paste0(
  "https://archive-api.open-meteo.com/v1/archive?",
  "latitude=", lat,
  "&longitude=", lon,
  "&start_date=", date_range_start,
  "&end_date=", date_range_end,
  "&daily=temperature_2m_max,temperature_2m_min,temperature_2m_mean,",
  "precipitation_sum,rain_sum,windspeed_10m_max,weathercode",
  "&temperature_unit=fahrenheit",
  "&windspeed_unit=mph",
  "&precipitation_unit=inch",
  "&timezone=America/Chicago"
)

weather_response <- tryCatch({
  GET(weather_url, timeout(30))
}, error = function(e) NULL)

if (!is.null(weather_response) && status_code(weather_response) == 200) {
  weather_json <- fromJSON(content(weather_response, "text", encoding = "UTF-8"))

  get_weather_description <- function(code) {
    weather_codes <- c(
      "0" = "Clear sky", "1" = "Mainly clear", "2" = "Partly cloudy", "3" = "Overcast",
      "45" = "Foggy", "48" = "Depositing rime fog",
      "51" = "Light drizzle", "53" = "Moderate drizzle", "55" = "Dense drizzle",
      "61" = "Slight rain", "63" = "Moderate rain", "65" = "Heavy rain",
      "71" = "Slight snow", "73" = "Moderate snow", "75" = "Heavy snow",
      "80" = "Slight rain showers", "81" = "Moderate rain showers", "82" = "Violent rain showers",
      "95" = "Thunderstorm", "96" = "Thunderstorm with hail", "99" = "Thunderstorm with heavy hail"
    )
    code_str <- as.character(code)
    if (code_str %in% names(weather_codes)) return(weather_codes[code_str])
    return("Unknown")
  }

  weather_df <- data.frame(
    complete_date = as.Date(weather_json$daily$time),
    weather_temp_max_f = weather_json$daily$temperature_2m_max,
    weather_temp_min_f = weather_json$daily$temperature_2m_min,
    weather_temp_mean_f = weather_json$daily$temperature_2m_mean,
    weather_precipitation_in = weather_json$daily$precipitation_sum,
    weather_rain_in = weather_json$daily$rain_sum,
    weather_wind_max_mph = weather_json$daily$windspeed_10m_max,
    weather_code = weather_json$daily$weathercode
  ) %>%
    mutate(weather_description = sapply(weather_code, get_weather_description))

  puzzle_data <- puzzle_data %>%
    left_join(weather_df, by = "complete_date")
}

Add Stock Market Data

get_stock_performance <- function(symbol, start_date, end_date) {
  tryCatch({
    extended_start <- start_date - 7
    stock_data <- getSymbols(symbol, src = "yahoo", from = extended_start,
                             to = end_date + 1, auto.assign = FALSE, warnings = FALSE)
    stock_data <- stock_data[!is.na(Cl(stock_data)), ]
    prices <- Cl(stock_data)
    returns <- dailyReturn(prices)
    data.frame(
      date = index(prices),
      close = as.numeric(prices),
      daily_return_pct = as.numeric(returns) * 100
    )
  }, error = function(e) NULL)
}

# S&P 500
sp500_data <- get_stock_performance("^GSPC", date_range_start, date_range_end)
if (!is.null(sp500_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(sp500_data %>% rename(complete_date = date, sp500_close = close,
                                     sp500_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Dow Jones
djia_data <- get_stock_performance("^DJI", date_range_start, date_range_end)
if (!is.null(djia_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(djia_data %>% rename(complete_date = date, djia_close = close,
                                    djia_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# NASDAQ
nasdaq_data <- get_stock_performance("^IXIC", date_range_start, date_range_end)
if (!is.null(nasdaq_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(nasdaq_data %>% rename(complete_date = date, nasdaq_close = close,
                                      nasdaq_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Gold
gold_data <- get_stock_performance("GC=F", date_range_start, date_range_end)
if (!is.null(gold_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(gold_data %>% rename(complete_date = date, gold_close = close,
                                    gold_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Oil
oil_data <- get_stock_performance("CL=F", date_range_start, date_range_end)
if (!is.null(oil_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(oil_data %>% rename(complete_date = date, oil_wti_close = close,
                                   oil_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

Add Holiday Proximity

get_us_holidays <- function(year) {
  # Major US holidays; nth-weekday dates are computed with lubridate's wday(),
  # where Sunday = 1 and Monday = 2
  jan1  <- as.Date(paste0(year, "-01-01"))
  feb1  <- as.Date(paste0(year, "-02-01"))
  may31 <- as.Date(paste0(year, "-05-31"))
  sep1  <- as.Date(paste0(year, "-09-01"))
  nov1  <- as.Date(paste0(year, "-11-01"))

  c(
    jan1,                                  # New Year's Day
    jan1 + 14 + (9 - wday(jan1)) %% 7,     # MLK Day (3rd Monday of January)
    feb1 + 14 + (9 - wday(feb1)) %% 7,     # Presidents Day (3rd Monday of February)
    may31 - ((wday(may31) - 2) %% 7),      # Memorial Day (last Monday of May)
    as.Date(paste0(year, "-07-04")),       # Independence Day
    sep1 + (9 - wday(sep1)) %% 7,          # Labor Day (1st Monday of September)
    nov1 + 21 + (5 - wday(nov1)) %% 7,     # Thanksgiving (4th Thursday of November)
    as.Date(paste0(year, "-12-25"))        # Christmas Day
  )
}
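
A quick spot-check of the date arithmetic (the dates in the comments are the known 2024 US federal holiday dates):

# Spot-check the generator against known 2024 holidays
get_us_holidays(2024)
# Expected: 2024-01-01 (New Year's), 2024-01-15 (MLK Day), 2024-02-19 (Presidents Day),
#           2024-05-27 (Memorial Day), 2024-07-04, 2024-09-02 (Labor Day),
#           2024-11-28 (Thanksgiving), 2024-12-25 (Christmas)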

years <- unique(year(puzzle_data$complete_date))
all_holidays <- as.Date(unlist(lapply(years, get_us_holidays)), origin = "1970-01-01")

puzzle_data <- puzzle_data %>%
  mutate(
    days_to_nearest_holiday = sapply(complete_date, function(d) min(abs(as.numeric(d - all_holidays))))
  )

Add Derived Features

puzzle_data <- puzzle_data %>%
  mutate(
    day_of_week = wday(complete_date, label = TRUE, abbr = FALSE),
    days_delay = as.numeric(complete_date - paper_date),
    month = month(complete_date, label = TRUE, abbr = FALSE),
    year = year(complete_date),
    week_of_year = week(complete_date),
    completed_same_day = days_delay == 0,
    date_numeric = as.numeric(complete_date - min(complete_date)),
    holiday_proximity = case_when(
      days_to_nearest_holiday <= 3 ~ "Within 3 days",
      days_to_nearest_holiday <= 7 ~ "4-7 days",
      days_to_nearest_holiday <= 14 ~ "8-14 days",
      TRUE ~ "More than 14 days"
    ),
    holiday_proximity = factor(holiday_proximity,
                               levels = c("Within 3 days", "4-7 days", "8-14 days", "More than 14 days")),
    temp_category = case_when(
      weather_temp_mean_f < 50 ~ "Cold (<50F)",
      weather_temp_mean_f < 70 ~ "Mild (50-70F)",
      weather_temp_mean_f < 85 ~ "Warm (70-85F)",
      weather_temp_mean_f >= 85 ~ "Hot (85F+)"
    ),
    market_direction = case_when(
      is.na(sp500_daily_return_pct) ~ NA_character_,
      sp500_daily_return_pct < -1 ~ "Down >1%",
      sp500_daily_return_pct < 0 ~ "Slightly Down",
      sp500_daily_return_pct < 1 ~ "Slightly Up",
      TRUE ~ "Up >1%"
    )
  )

Add Sentiment Analysis on Notes

# Function to analyze sentiment of a text
analyze_sentiment <- function(text) {
  if (is.na(text) || text == "") {
    return(list(
      syuzhet_score = NA_real_,
      word_count = 0,
      sentiment_label = NA_character_
    ))
  }

  # Get sentiment score using syuzhet
  syuzhet_score <- get_sentiment(text, method = "syuzhet")

  # Get word count
  words_df <- tibble(text = text) %>%
    unnest_tokens(word, text)
  word_count <- nrow(words_df)

  # Create sentiment label
  if (syuzhet_score > 0.5) {
    sentiment_label <- "Positive"
  } else if (syuzhet_score < -0.5) {
    sentiment_label <- "Negative"
  } else {
    sentiment_label <- "Neutral"
  }

  return(list(
    syuzhet_score = syuzhet_score,
    word_count = word_count,
    sentiment_label = sentiment_label
  ))
}

# Apply sentiment analysis to each note
sentiment_results <- lapply(puzzle_data$notes, analyze_sentiment)

# Extract results into columns
puzzle_data <- puzzle_data %>%
  mutate(
    note_sentiment_score = sapply(sentiment_results, function(x) x$syuzhet_score),
    note_word_count = sapply(sentiment_results, function(x) x$word_count),
    note_sentiment_label = sapply(sentiment_results, function(x) x$sentiment_label),
    has_note = !is.na(notes) & notes != ""
  )

# Get NRC emotion scores
get_nrc_emotions <- function(text) {
  if (is.na(text) || text == "") {
    return(data.frame(
      anger = NA, anticipation = NA, disgust = NA, fear = NA,
      joy = NA, sadness = NA, surprise = NA, trust = NA
    ))
  }
  emotions <- get_nrc_sentiment(text)
  return(emotions[, c("anger", "anticipation", "disgust", "fear",
                      "joy", "sadness", "surprise", "trust")])
}

nrc_emotions <- bind_rows(lapply(puzzle_data$notes, get_nrc_emotions))

puzzle_data <- puzzle_data %>%
  mutate(
    note_anger = nrc_emotions$anger,
    note_anticipation = nrc_emotions$anticipation,
    note_disgust = nrc_emotions$disgust,
    note_fear = nrc_emotions$fear,
    note_joy = nrc_emotions$joy,
    note_sadness = nrc_emotions$sadness,
    note_surprise = nrc_emotions$surprise,
    note_trust = nrc_emotions$trust
  )

# Summary
sentiment_summary <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  count(note_sentiment_label)
cat("Sentiment distribution:\n")
## Sentiment distribution:
print(sentiment_summary)
## # A tibble: 3 × 2
##   note_sentiment_label     n
##   <chr>                <int>
## 1 Negative                29
## 2 Neutral                 94
## 3 Positive                24

Final Dataset Summary

cat("Final dataset:", nrow(puzzle_data), "records,", ncol(puzzle_data), "columns\n")
## Final dataset: 157 records, 53 columns

Descriptive Statistics

Overall Time Distribution

time_stats <- puzzle_data %>%
  summarise(
    N = n(),
    Mean = round(mean(time_minutes), 2),
    Median = round(median(time_minutes), 2),
    SD = round(sd(time_minutes), 2),
    Min = round(min(time_minutes), 2),
    Max = round(max(time_minutes), 2),
    Q1 = round(quantile(time_minutes, 0.25), 2),
    Q3 = round(quantile(time_minutes, 0.75), 2)
  )

kable(time_stats, caption = "Completion Time Statistics (minutes)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time Statistics (minutes)
N Mean Median SD Min Max Q1 Q3
157 42.75 41.92 10.47 23.45 84.98 34.9 48.37
ggplot(puzzle_data, aes(x = time_minutes)) +
  geom_histogram(bins = 25, fill = "steelblue", color = "white", alpha = 0.8) +
  geom_vline(aes(xintercept = mean(time_minutes)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = median(time_minutes)), color = "orange", linetype = "dashed", size = 1) +
  labs(
    title = "Distribution of Puzzle Completion Times",
    subtitle = "Red = Mean, Orange = Median",
    x = "Time (minutes)",
    y = "Count"
  ) +
  theme(plot.title = element_text(face = "bold", size = 14))

By Paper Source (B vs J)

The “Whose Paper” column indicates whose physical newspaper was used for that puzzle session. Branden and James work together as a team on all puzzles; this field simply tracks who brought the paper that day. In a few cases, using James’s paper resulted in accidentally breaking the Sudoku in an unrecoverable way.

paper_stats <- puzzle_data %>%
  group_by(whose_paper) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    `Min (min)` = round(min(time_minutes), 2),
    `Max (min)` = round(max(time_minutes), 2),
    .groups = "drop"
  )

kable(paper_stats, caption = "Team Completion Time by Paper Source (B = Branden's paper, J = James's paper)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Team Completion Time by Paper Source (B = Branden’s paper, J = James’s paper)
whose_paper N Mean (min) Median (min) SD (min) Min (min) Max (min)
B 141 42.89 41.97 10.41 23.45 84.98
J 16 41.46 39.65 11.20 24.08 68.27
ggplot(puzzle_data, aes(x = whose_paper, y = time_minutes, fill = whose_paper)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Team Completion Time by Paper Source",
    subtitle = "B = Branden's paper, J = James's paper",
    x = "Paper Source",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")
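
The summary section cites p = 0.61 for this comparison. The exact test behind that value is not shown in the report; a Welch two-sample t-test is one minimal sketch of the same check:

# Does paper source (B vs J) make a difference? Welch two-sample t-test
# (one reasonable check; the report does not show which test produced the cited p-value)
t.test(time_minutes ~ whose_paper, data = puzzle_data)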


Time Trend Analysis

Are We Getting Better?

trend_model <- lm(time_minutes ~ date_numeric, data = puzzle_data)
trend_summary <- summary(trend_model)

ggplot(puzzle_data, aes(x = complete_date, y = time_minutes, color = whose_paper)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = TRUE, aes(group = 1), color = "black", linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Team Completion Time Trend Over Time",
    subtitle = paste0("Overall trend: ", round(coef(trend_model)[2] * 30, 2),
                      " minutes/month (p = ", format(trend_summary$coefficients[2, 4], digits = 3), ")"),
    x = "Date",
    y = "Time (minutes)",
    color = "Paper Source"
  ) +
  theme(plot.title = element_text(face = "bold", size = 14))

trend_by_paper <- puzzle_data %>%
  group_by(whose_paper) %>%
  do({
    model <- lm(time_minutes ~ date_numeric, data = .)
    data.frame(
      `Monthly Change (min)` = round(coef(model)[2] * 30, 3),
      `p-value` = round(summary(model)$coefficients[2, 4], 4),
      Direction = ifelse(coef(model)[2] < 0, "Improving", "Getting Slower")
    )
  }) %>%
  ungroup()

kable(trend_by_paper, caption = "Time Trend by Paper Source") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Time Trend by Paper Source
whose_paper Monthly.Change..min. p.value Direction
B -0.292 0.0003 Improving
J -0.398 0.1267 Improving

Key Finding: The team is improving over time regardless of whose paper is used, with times decreasing by approximately 0.3-0.4 minutes per month.
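
To put an interval around that headline rate, the per-day slope from trend_model can be converted to a monthly change along with its 95% confidence interval (a small sketch reusing the fit above; the 30-day month is an approximation):

# 95% confidence interval for the monthly rate of change, from the overall trend model
slope_ci <- confint(trend_model)["date_numeric", ] * 30  # per-day slope scaled to ~30-day months
round(slope_ci, 3)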


Correlation Analysis

Numeric Predictors

numeric_cols <- puzzle_data %>%
  select(
    time_minutes,
    moon_illumination,
    weather_temp_mean_f, weather_precipitation_in, weather_wind_max_mph,
    sp500_daily_return_pct, djia_daily_return_pct, nasdaq_daily_return_pct,
    gold_daily_return_pct, oil_daily_return_pct,
    days_to_nearest_holiday, days_delay
  ) %>%
  drop_na()

correlations <- cor(numeric_cols)[, "time_minutes"]
correlations <- correlations[names(correlations) != "time_minutes"]
correlations <- sort(correlations, decreasing = TRUE)

cor_df <- data.frame(
  Variable = names(correlations),
  Correlation = round(correlations, 4),
  `Abs Correlation` = abs(round(correlations, 4))
) %>%
  arrange(desc(`Abs.Correlation`))

kable(cor_df, caption = "Correlations with Completion Time") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(abs(cor_df$Correlation) > 0.1), bold = TRUE, background = "#ffffcc")
Correlations with Completion Time
Variable Correlation Abs.Correlation
weather_wind_max_mph 0.1629 0.1629
weather_precipitation_in 0.0975 0.0975
gold_daily_return_pct -0.0698 0.0698
moon_illumination -0.0698 0.0698
djia_daily_return_pct -0.0551 0.0551
days_delay 0.0502 0.0502
sp500_daily_return_pct -0.0460 0.0460
nasdaq_daily_return_pct -0.0244 0.0244
oil_daily_return_pct 0.0122 0.0122
days_to_nearest_holiday -0.0112 0.0112
weather_temp_mean_f -0.0099 0.0099
cor_matrix <- cor(numeric_cols, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper",
         tl.cex = 0.8, tl.col = "black",
         addCoef.col = "black", number.cex = 0.7,
         col = colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200))

Key Finding: Wind speed shows the strongest correlation with completion time (r = 0.16), though even this relationship is weak. Windier days tend to go with slower puzzle times.
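
Because r = 0.16 is modest, it is worth checking whether it is distinguishable from zero; a quick sketch with cor.test (rows with missing weather data are dropped automatically):

# Significance check for the wind/time correlation
cor.test(puzzle_data$weather_wind_max_mph, puzzle_data$time_minutes)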


Categorical Variable Analysis

Day of Week

day_stats <- puzzle_data %>%
  group_by(day_of_week) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(day_stats, caption = "Completion Time by Day of Week") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Day of Week
day_of_week N Mean (min) Median (min) SD (min)
Saturday 3 35.71 32.05 7.50
Wednesday 125 41.49 40.67 9.67
Friday 8 42.39 43.21 7.62
Tuesday 6 51.18 50.25 9.05
Thursday 10 51.38 46.52 15.35
Monday 5 51.65 50.10 11.48
day_order <- puzzle_data %>%
  group_by(day_of_week) %>%
  summarise(mean_time = mean(time_minutes)) %>%
  arrange(mean_time) %>%
  pull(day_of_week)

ggplot(puzzle_data, aes(x = factor(day_of_week, levels = day_order), y = time_minutes, fill = day_of_week)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by Day of Week",
    subtitle = "ANOVA p = 0.003 (Significant!)",
    x = "Day",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")

Key Finding: Day of the week significantly affects completion time (p = 0.003). Completing on the paper day (Wednesday) is faster than delaying; the Saturday mean is lower still, but it rests on only three sessions.
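
The p-value in the figure subtitle is hard-coded; the underlying one-way ANOVA can be reproduced directly (a sketch):

# One-way ANOVA: does mean completion time differ by day of week?
summary(aov(time_minutes ~ day_of_week, data = puzzle_data))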

Same Day Completion

same_day_stats <- puzzle_data %>%
  group_by(completed_same_day) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  mutate(completed_same_day = ifelse(completed_same_day, "Same Day", "Delayed"))

kable(same_day_stats, caption = "Same Day vs Delayed Completion") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Same Day vs Delayed Completion
completed_same_day N Mean (min) Median (min)
Delayed 36 46.72 44.33
Same Day 121 41.57 40.67
ggplot(puzzle_data, aes(x = completed_same_day, y = time_minutes, fill = completed_same_day)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.2) +
  scale_fill_manual(values = c("TRUE" = "#2ecc71", "FALSE" = "#e74c3c")) +
  labs(
    title = "Same Day vs Delayed Completion",
    subtitle = "ANOVA p = 0.009 (Significant!)",
    x = "Completed Same Day",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")

Key Finding: Completing puzzles on the same day as the paper is significantly faster (41.6 min vs 46.7 min, p = 0.009).
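
Since this comparison has only two groups, a Welch two-sample t-test is a natural companion to the ANOVA quoted in the figure (a sketch; it does not assume equal variances):

# Same-day vs delayed completion: Welch two-sample t-test
t.test(time_minutes ~ completed_same_day, data = puzzle_data)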

Zodiac Sign

zodiac_stats <- puzzle_data %>%
  group_by(zodiac_sign) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(zodiac_stats, caption = "Completion Time by Zodiac Sign") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Zodiac Sign
zodiac_sign N Mean (min) Median (min)
Leo 14 40.32 38.14
Sagittarius 13 40.33 41.63
Capricorn 13 40.85 34.67
Libra 13 41.39 41.92
Taurus 11 42.02 44.00
Gemini 13 42.15 40.05
Aries 13 42.44 40.07
Pisces 14 43.01 44.12
Cancer 14 43.24 42.56
Scorpio 13 44.01 42.08
Virgo 13 44.40 37.82
Aquarius 13 48.85 46.77
ggplot(puzzle_data, aes(x = reorder(zodiac_sign, time_minutes), y = time_minutes)) +
  geom_boxplot(fill = "#9b59b6", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Completion Time by Zodiac Sign",
    subtitle = "ANOVA p = 0.80 (Not Significant)",
    x = "Zodiac Sign",
    y = "Time (minutes)"
  )

Moon Phase

moon_stats <- puzzle_data %>%
  group_by(moon_phase_name) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(moon_stats, caption = "Completion Time by Moon Phase") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Moon Phase
moon_phase_name N Mean (min) Median (min)
New Moon 8 40.57 38.72
Waxing Gibbous 37 41.24 41.52
Last Quarter 6 41.40 38.40
Waning Gibbous 34 41.82 40.24
Full Moon 3 42.61 45.25
Waning Crescent 28 44.20 44.52
Waxing Crescent 40 44.40 43.13
First Quarter 1 49.37 49.37
ggplot(puzzle_data, aes(x = reorder(moon_phase_name, time_minutes), y = time_minutes)) +
  geom_boxplot(fill = "#34495e", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Completion Time by Moon Phase",
    subtitle = "ANOVA p = 0.85 (Not Significant)",
    x = "Moon Phase",
    y = "Time (minutes)"
  )


Weather Impact

Temperature

temp_stats <- puzzle_data %>%
  filter(!is.na(temp_category)) %>%
  group_by(temp_category) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(temp_stats, caption = "Completion Time by Temperature") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Temperature
temp_category N Mean (min) Median (min)
Warm (70-85F) 55 41.08 38.38
Mild (50-70F) 52 42.73 43.46
Cold (<50F) 23 43.70 40.25
Hot (85F+) 27 45.37 43.82
ggplot(puzzle_data %>% filter(!is.na(weather_temp_mean_f)),
       aes(x = weather_temp_mean_f, y = time_minutes)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Completion Time vs Temperature",
    x = "Mean Temperature (F)",
    y = "Time (minutes)"
  )

Wind Speed

ggplot(puzzle_data %>% filter(!is.na(weather_wind_max_mph)),
       aes(x = weather_wind_max_mph, y = time_minutes)) +
  geom_point(alpha = 0.5, color = "#27ae60") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Completion Time vs Wind Speed",
    subtitle = paste("Correlation r =", round(cor(puzzle_data$time_minutes,
                                                   puzzle_data$weather_wind_max_mph,
                                                   use = "complete.obs"), 3)),
    x = "Max Wind Speed (mph)",
    y = "Time (minutes)"
  )

Key Finding: Higher wind speeds are associated with slower completion times (r = 0.16).


Holiday Proximity

holiday_stats <- puzzle_data %>%
  group_by(holiday_proximity) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  )

kable(holiday_stats, caption = "Completion Time by Holiday Proximity") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Holiday Proximity
holiday_proximity N Mean (min) Median (min)
Within 3 days 21 42.64 42.98
4-7 days 22 42.67 40.88
8-14 days 42 44.08 43.64
More than 14 days 72 42.03 40.59
ggplot(puzzle_data, aes(x = holiday_proximity, y = time_minutes, fill = holiday_proximity)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by Holiday Proximity",
    subtitle = "ANOVA p = 0.80 (Not Significant)",
    x = "Days to Nearest Holiday",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Blues")

Finding: Holiday proximity does not significantly affect completion time.


Stock Market Impact

market_stats <- puzzle_data %>%
  filter(!is.na(market_direction)) %>%
  group_by(market_direction) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(market_stats, caption = "Completion Time by S&P 500 Direction") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by S&P 500 Direction
market_direction N Mean (min) Median (min)
Slightly Up 69 41.40 41.33
Down >1% 15 42.64 42.08
Up >1% 20 44.36 43.97
Slightly Down 49 44.55 42.48
ggplot(puzzle_data %>% filter(!is.na(market_direction)),
       aes(x = market_direction, y = time_minutes, fill = market_direction)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by S&P 500 Performance",
    subtitle = "ANOVA p = 0.39 (Not Significant)",
    x = "Market Direction",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "RdYlGn")

Finding: Stock market performance does not significantly affect completion time.


Sentiment Analysis of Notes

Sentiment Distribution

sentiment_dist <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  count(note_sentiment_label) %>%
  mutate(Percentage = round(n / sum(n) * 100, 1))

kable(sentiment_dist, col.names = c("Sentiment", "Count", "Percentage"),
      caption = "Distribution of Note Sentiments") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Distribution of Note Sentiments
Sentiment Count Percentage
Negative 29 19.7
Neutral 94 63.9
Positive 24 16.3
ggplot(sentiment_dist, aes(x = "", y = n, fill = note_sentiment_label)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(title = "Sentiment Distribution in Notes", fill = "Sentiment") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Sentiment vs Completion Time

sentiment_time <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  group_by(note_sentiment_label) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(sentiment_time, caption = "Completion Time by Note Sentiment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Note Sentiment
note_sentiment_label N Mean (min) Median (min) SD (min)
Positive 24 40.72 40.39 10.77
Neutral 94 41.88 41.42 10.61
Negative 29 46.81 47.12 10.11
ggplot(puzzle_data %>% filter(!is.na(note_sentiment_label)),
       aes(x = note_sentiment_label, y = time_minutes, fill = note_sentiment_label)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_fill_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(
    title = "Completion Time by Note Sentiment",
    subtitle = paste("ANOVA p =", round(summary(aov(time_minutes ~ note_sentiment_label,
                                                     data = puzzle_data %>%
                                                       filter(!is.na(note_sentiment_label))))[[1]][["Pr(>F)"]][1], 4)),
    x = "Sentiment",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")

Sentiment Score vs Time

sentiment_cor <- cor(puzzle_data$time_minutes, puzzle_data$note_sentiment_score, use = "complete.obs")

ggplot(puzzle_data %>% filter(!is.na(note_sentiment_score)),
       aes(x = note_sentiment_score, y = time_minutes)) +
  geom_point(alpha = 0.5, aes(color = note_sentiment_label)) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  scale_color_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(
    title = "Completion Time vs Sentiment Score",
    subtitle = paste("Correlation r =", round(sentiment_cor, 3)),
    x = "Sentiment Score (negative = frustrated, positive = happy)",
    y = "Time (minutes)",
    color = "Sentiment"
  )

NRC Emotion Analysis

The NRC lexicon categorizes words into 8 emotions: anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
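
For context, get_nrc_sentiment() returns one row of word counts per input string, with one column per emotion plus overall negative/positive counts. A small illustration on a made-up note (the text below is hypothetical, not taken from the dataset):

# Illustration only: a hypothetical note, not from the actual data
example_note <- "Terrible sudoku today, but we rallied and finished strong"
get_nrc_sentiment(example_note)
# Returns a one-row data frame with counts for anger, anticipation, disgust, fear,
# joy, sadness, surprise, trust, plus overall negative and positive word counts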

emotion_cols <- c("note_anger", "note_anticipation", "note_disgust", "note_fear",
                  "note_joy", "note_sadness", "note_surprise", "note_trust")

emotion_cors <- sapply(emotion_cols, function(col) {
  cor(puzzle_data$time_minutes, puzzle_data[[col]], use = "complete.obs")
})

emotion_cor_df <- data.frame(
  Emotion = gsub("note_", "", names(emotion_cors)),
  Correlation = round(emotion_cors, 4)
) %>%
  arrange(desc(abs(Correlation)))

kable(emotion_cor_df, caption = "Emotion Correlations with Completion Time") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(abs(emotion_cor_df$Correlation) > 0.05), bold = TRUE, background = "#ffffcc")
Emotion Correlations with Completion Time
Emotion Correlation
anger 0.2270
anticipation -0.1210
fear 0.0831
sadness 0.0722
joy -0.0412
disgust -0.0393
trust 0.0226
surprise 0.0123
emotion_cor_df$Emotion <- factor(emotion_cor_df$Emotion, levels = emotion_cor_df$Emotion)

ggplot(emotion_cor_df, aes(x = reorder(Emotion, Correlation), y = Correlation,
                            fill = ifelse(Correlation > 0, "Positive", "Negative"))) +
  geom_bar(stat = "identity", alpha = 0.8) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#e74c3c", "Negative" = "#2ecc71")) +
  labs(
    title = "Emotion Correlations with Completion Time",
    subtitle = "Positive correlation = emotion associated with slower times",
    x = "Emotion",
    y = "Correlation with Time"
  ) +
  theme(legend.position = "none") +
  geom_hline(yintercept = 0, linetype = "dashed")

Sample Notes by Sentiment

Note: For the public version of this report, I have removed my notes because at times they can get quite colorful.

Word Cloud of Notes

# Get all words from notes
all_words <- puzzle_data %>%
  filter(!is.na(notes)) %>%
  select(notes) %>%
  unnest_tokens(word, notes) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 1)

# Create word cloud
if (nrow(all_words) > 0) {
  wordcloud(words = all_words$word, freq = all_words$n,
            min.freq = 2, max.words = 100,
            random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}


Regression Analysis

Multiple Regression Model

# Note: whose_paper represents paper source (whose newspaper was used), not who completed the puzzles
# The team works together on all puzzles - this variable tests if paper source affects completion time
regression_data <- puzzle_data %>%
  filter(!is.na(weather_temp_mean_f) & !is.na(sp500_daily_return_pct)) %>%
  mutate(
    paper_source = factor(whose_paper),  # Renamed for clarity
    day_of_week = factor(day_of_week)
  )

full_model <- lm(
  time_minutes ~
    paper_source +
    days_delay +
    weather_temp_mean_f +
    weather_precipitation_in +
    weather_wind_max_mph +
    moon_illumination +
    sp500_daily_return_pct +
    days_to_nearest_holiday +
    date_numeric,
  data = regression_data
)

model_summary <- summary(full_model)

cat("R-squared:", round(model_summary$r.squared, 4), "\n")
## R-squared: 0.1127
cat("Adjusted R-squared:", round(model_summary$adj.r.squared, 4), "\n")
## Adjusted R-squared: 0.0568
cat("F-statistic p-value:", format(pf(model_summary$fstatistic[1],
                                       model_summary$fstatistic[2],
                                       model_summary$fstatistic[3],
                                       lower.tail = FALSE), digits = 4), "\n")
## F-statistic p-value: 0.04138
coef_table <- tidy(full_model) %>%
  mutate(
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = round(p.value, 4),
    Significant = ifelse(p.value < 0.05, "**", ifelse(p.value < 0.1, "*", ""))
  ) %>%
  arrange(p.value)

kable(coef_table, caption = "Multiple Regression Coefficients") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(coef_table$p.value < 0.05), bold = TRUE, background = "#d4edda") %>%
  row_spec(which(coef_table$p.value >= 0.05 & coef_table$p.value < 0.1), background = "#fff3cd")
Multiple Regression Coefficients
term estimate std.error statistic p.value Significant
(Intercept) 47.3919 5.4921 8.629 0.0000 **
date_numeric -0.0092 0.0027 -3.349 0.0010 **
weather_precipitation_in 1.9363 2.0644 0.938 0.3499
moon_illumination -2.0632 2.4140 -0.855 0.3942
paper_sourceJ -2.0730 2.7446 -0.755 0.4513
weather_wind_max_mph 0.1402 0.2115 0.663 0.5084
sp500_daily_return_pct -0.2932 0.7567 -0.387 0.6990
days_delay 0.0013 0.0041 0.330 0.7419
days_to_nearest_holiday -0.0195 0.0706 -0.275 0.7834
weather_temp_mean_f 0.0014 0.0549 0.025 0.9800

Random Forest Feature Importance

# Note: paper_source represents whose newspaper was used, not who completed the puzzles
rf_data <- puzzle_data %>%
  select(
    time_minutes, whose_paper, days_delay,
    weather_temp_mean_f, weather_precipitation_in, weather_wind_max_mph,
    moon_illumination, sp500_daily_return_pct, djia_daily_return_pct,
    nasdaq_daily_return_pct, days_to_nearest_holiday, date_numeric,
    gold_daily_return_pct, oil_daily_return_pct
  ) %>%
  rename(paper_source = whose_paper) %>%  # Renamed for clarity
  drop_na() %>%
  mutate(paper_source = factor(paper_source))

set.seed(42)
rf_model <- randomForest(time_minutes ~ ., data = rf_data, ntree = 500, importance = TRUE)

importance_df <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("Variable") %>%
  arrange(desc(`%IncMSE`)) %>%
  mutate(
    `%IncMSE` = round(`%IncMSE`, 2),
    IncNodePurity = round(IncNodePurity, 2)
  )

kable(importance_df, caption = "Random Forest Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Random Forest Feature Importance
Variable %IncMSE IncNodePurity
date_numeric 1.44 2359.46
sp500_daily_return_pct 0.60 1137.80
paper_source 0.16 132.00
weather_wind_max_mph 0.05 1526.89
days_delay -0.38 789.55
moon_illumination -0.53 1520.78
djia_daily_return_pct -0.82 1001.26
days_to_nearest_holiday -2.66 988.86
weather_precipitation_in -3.06 777.06
oil_daily_return_pct -3.22 1070.87
nasdaq_daily_return_pct -3.81 1133.81
weather_temp_mean_f -4.39 1575.64
gold_daily_return_pct -4.65 1191.18
varImpPlot(rf_model, main = "Random Forest Feature Importance")

Key Finding: The most important predictor is date_numeric (the time trend), consistent with completion times improving over time. The %IncMSE values are small overall (and mostly negative), so none of the external factors add much predictive power.
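
To see how the top predictor drives the fitted response, randomForest’s partialPlot() gives a quick partial-dependence view (a sketch reusing rf_model and rf_data from above; the axis labels are just suggestions):

# Partial dependence of predicted completion time on the time trend variable
partialPlot(rf_model, pred.data = as.data.frame(rf_data), x.var = "date_numeric",
            main = "Partial Dependence: date_numeric",
            xlab = "Days since first session")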


Record Times

fastest <- puzzle_data %>% slice_min(time_minutes, n = 5)
slowest <- puzzle_data %>% slice_max(time_minutes, n = 5)

kable(
  fastest %>%
    select(complete_date, whose_paper, time_minutes, zodiac_sign, weather_description) %>%
    rename(`Paper Source` = whose_paper),
  caption = "Top 5 Fastest Team Times"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Top 5 Fastest Team Times
complete_date Paper Source time_minutes zodiac_sign weather_description
2024-06-05 B 23.45000 Gemini Slight rain
2025-02-05 J 24.08333 Aquarius Overcast
2025-09-10 B 24.40000 Virgo Overcast
2024-08-21 B 24.85000 Leo Overcast
2024-06-12 B 26.76667 Gemini Moderate drizzle
kable(
  slowest %>%
    select(complete_date, whose_paper, time_minutes, zodiac_sign, weather_description, notes) %>%
    rename(`Paper Source` = whose_paper),
  caption = "Top 5 Slowest Team Times"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Top 5 Slowest Team Times
complete_date Paper Source time_minutes zodiac_sign weather_description notes
2023-01-26 B 84.98333 Aquarius Mainly clear Bert showed up and distracted everyone.
2023-09-20 B 68.86667 Virgo Light drizzle Bert wasn’t even here. 
2023-05-24 J 68.26667 Gemini Slight rain Fuckin Shortz
2025-01-02 B 67.66667 Capricorn Clear sky Yeah, not a great start. Sudoku and NYT killed us.
2025-07-14 B 64.33333 Cancer Heavy rain 5am flight from MIA. G left the paper on the lawn.

Summary & Conclusions

Statistically Significant Findings

Factor Effect Evidence
Time Trend Improving ~0.3 min/month p < 0.001
Day of Week Wednesday is fastest p = 0.003
Same Day Completion ~5 minutes faster p = 0.009
Wind Speed Higher wind = slower r = 0.16

Sentiment Analysis Findings

# Calculate sentiment stats for summary
sentiment_summary_stats <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  group_by(note_sentiment_label) %>%
  summarise(mean_time = mean(time_minutes), .groups = "drop")

sentiment_aov_p <- summary(aov(time_minutes ~ note_sentiment_label,
                               data = puzzle_data %>%
                                 filter(!is.na(note_sentiment_label))))[[1]][["Pr(>F)"]][1]

  • Note sentiment shows a marginal trend in completion time (p = 0.057): sessions with negative notes averaged about 47 minutes versus about 41 for positive notes
  • Negative notes (frustration) tend to accompany harder puzzles or mistakes
  • Among the NRC emotions, anger has the strongest correlation with slower times (r = 0.23), while anticipation leans slightly the other way (r = -0.12)

Not Significant

  • Holiday proximity (p = 0.80)
  • Zodiac sign (p = 0.80)
  • Moon phase (p = 0.85)
  • Temperature (p = 0.35)
  • Stock market (p = 0.39)
  • Paper source (B vs J) (p = 0.61) - no difference based on whose newspaper was used

Recommendations

  1. Complete puzzles on Wednesday - Don’t delay!
  2. Avoid windy days if you want a faster time
  3. Keep practicing - The team is getting better!
  4. Either paper works - No advantage to using Branden’s vs James’s newspaper

Appendix: Full Dataset

Note: “whose_paper” indicates whose physical newspaper was used (B = Branden’s paper, J = James’s paper). All puzzles were completed as a team effort by Branden and James working together.

DT::datatable(
  puzzle_data %>%
    # notes column omitted here; see the earlier note about removing colorful notes from the public version
    select(paper_date, complete_date, time_minutes, whose_paper,
           zodiac_sign, moon_phase_name, weather_description,
           sp500_daily_return_pct) %>%
    mutate(time_minutes = round(time_minutes, 2),
           sp500_daily_return_pct = round(sp500_daily_return_pct, 2)) %>%
    rename(`Paper Source` = whose_paper),
  options = list(pageLength = 10, scrollX = TRUE),
  caption = "Full Puzzle Wednesday Dataset (Team Completion Times)"
)

Report generated on 2026-01-24 10:45:54.601529