Executive Summary

This report analyzes Puzzle Wednesday data: the time it takes Branden and James, working together as a team, to complete the Sudoku, NY Times Crossword, and Regular Commuter Crossword in the Wednesday edition of the Dallas Morning News.

The “Whose Paper” field indicates whose physical newspaper was used for that session (B = Branden brought his paper, J = James brought his paper), not who completed the puzzles.

Key Findings:

  • The team is getting faster! Completion times are improving by ~0.3 minutes per month
  • Day of completion matters: Finishing on Wednesday (paper day) is significantly faster than delaying
  • Wind affects performance: Higher wind speeds show a weak positive correlation with slower times (r = 0.16)
  • Best conditions (descriptive only; small samples): Leo zodiac, New Moon, Saturdays
  • Worst conditions (descriptive only; small samples): Aquarius zodiac, First Quarter moon, Mondays
  • Paper source doesn’t matter: No significant difference whether using Branden’s or James’s newspaper

Data Import & Transformation

Load Raw Data

puzzle_raw <- read_excel("data.xlsx") %>%
  rename(
    paper_date = `Paper Date`,
    complete_date = `Complete Date`,
    time = Time,
    whose_paper = `Whose Paper?`,
    notes = Notes
  ) %>%
  mutate(
    paper_date = as.Date(paper_date),
    complete_date = as.Date(complete_date)
  )

cat("Total records:", nrow(puzzle_raw), "\n")
## Total records: 157
cat("Date range:", as.character(min(puzzle_raw$paper_date)), "to", as.character(max(puzzle_raw$paper_date)))
## Date range: 2017-01-10 to 2025-12-31

Convert Time to Seconds/Minutes

puzzle_data <- puzzle_raw %>%
  mutate(
    time_hms = format(time, "%H:%M:%S"),
    hours = as.numeric(substr(time_hms, 1, 2)),
    minutes = as.numeric(substr(time_hms, 4, 5)),
    seconds_part = as.numeric(substr(time_hms, 7, 8)),
    time_seconds = hours * 3600 + minutes * 60 + seconds_part,
    time_minutes = time_seconds / 60
  ) %>%
  select(-hours, -minutes, -seconds_part)
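
An equivalent, more compact conversion is sketched below, assuming read_excel parsed the Time column as a POSIXct date-time; it uses lubridate accessors instead of string slicing.

# Alternative sketch: same conversion via lubridate accessors
# (assumes `time` was read in as a POSIXct date-time by read_excel)
puzzle_alt <- puzzle_raw %>%
  mutate(
    time_seconds = hour(time) * 3600 + minute(time) * 60 + second(time),
    time_minutes = time_seconds / 60
  )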

Add Zodiac Signs

get_zodiac_sign <- function(date) {
  month <- month(date)
  day <- day(date)

  if ((month == 3 && day >= 21) || (month == 4 && day <= 19)) return("Aries")
  if ((month == 4 && day >= 20) || (month == 5 && day <= 20)) return("Taurus")
  if ((month == 5 && day >= 21) || (month == 6 && day <= 20)) return("Gemini")
  if ((month == 6 && day >= 21) || (month == 7 && day <= 22)) return("Cancer")
  if ((month == 7 && day >= 23) || (month == 8 && day <= 22)) return("Leo")
  if ((month == 8 && day >= 23) || (month == 9 && day <= 22)) return("Virgo")
  if ((month == 9 && day >= 23) || (month == 10 && day <= 22)) return("Libra")
  if ((month == 10 && day >= 23) || (month == 11 && day <= 21)) return("Scorpio")
  if ((month == 11 && day >= 22) || (month == 12 && day <= 21)) return("Sagittarius")
  if ((month == 12 && day >= 22) || (month == 1 && day <= 19)) return("Capricorn")
  if ((month == 1 && day >= 20) || (month == 2 && day <= 18)) return("Aquarius")
  if ((month == 2 && day >= 19) || (month == 3 && day <= 20)) return("Pisces")

  return(NA_character_)
}

puzzle_data <- puzzle_data %>%
  mutate(zodiac_sign = sapply(complete_date, get_zodiac_sign))

Add Moon Phase Data

moon_data <- getMoonIllumination(date = puzzle_data$complete_date)

get_moon_phase_name <- function(phase) {
  # suncalc's phase value runs from 0 to 1 over one synodic month
  # (0 = new moon, 0.25 = first quarter, 0.5 = full moon, 0.75 = last quarter);
  # the four principal phases get narrow +/- 0.025 windows, and everything else
  # falls into the intermediate crescent/gibbous bins
  if (phase < 0.025 || phase >= 0.975) return("New Moon")
  if (phase < 0.25) return("Waxing Crescent")
  if (phase < 0.275) return("First Quarter")
  if (phase < 0.50) return("Waxing Gibbous")
  if (phase < 0.525) return("Full Moon")
  if (phase < 0.75) return("Waning Gibbous")
  if (phase < 0.775) return("Last Quarter")
  return("Waning Crescent")
}

puzzle_data <- puzzle_data %>%
  mutate(
    moon_illumination = moon_data$fraction,
    moon_phase_value = moon_data$phase,
    moon_phase_name = sapply(moon_data$phase, get_moon_phase_name)
  )

Add Weather Data (Flower Mound, TX)

lat <- 33.0146
lon <- -97.0969

unique_dates <- sort(unique(puzzle_data$complete_date))
date_range_start <- min(unique_dates)
date_range_end <- max(unique_dates)

weather_url <- paste0(
  "https://archive-api.open-meteo.com/v1/archive?",
  "latitude=", lat,
  "&longitude=", lon,
  "&start_date=", date_range_start,
  "&end_date=", date_range_end,
  "&daily=temperature_2m_max,temperature_2m_min,temperature_2m_mean,",
  "precipitation_sum,rain_sum,windspeed_10m_max,weathercode",
  "&temperature_unit=fahrenheit",
  "&windspeed_unit=mph",
  "&precipitation_unit=inch",
  "&timezone=America/Chicago"
)

weather_response <- tryCatch({
  GET(weather_url, timeout(30))
}, error = function(e) NULL)

if (!is.null(weather_response) && status_code(weather_response) == 200) {
  weather_json <- fromJSON(content(weather_response, "text", encoding = "UTF-8"))

  get_weather_description <- function(code) {
    weather_codes <- c(
      "0" = "Clear sky", "1" = "Mainly clear", "2" = "Partly cloudy", "3" = "Overcast",
      "45" = "Foggy", "48" = "Depositing rime fog",
      "51" = "Light drizzle", "53" = "Moderate drizzle", "55" = "Dense drizzle",
      "61" = "Slight rain", "63" = "Moderate rain", "65" = "Heavy rain",
      "71" = "Slight snow", "73" = "Moderate snow", "75" = "Heavy snow",
      "80" = "Slight rain showers", "81" = "Moderate rain showers", "82" = "Violent rain showers",
      "95" = "Thunderstorm", "96" = "Thunderstorm with hail", "99" = "Thunderstorm with heavy hail"
    )
    code_str <- as.character(code)
    if (code_str %in% names(weather_codes)) return(weather_codes[code_str])
    return("Unknown")
  }

  weather_df <- data.frame(
    complete_date = as.Date(weather_json$daily$time),
    weather_temp_max_f = weather_json$daily$temperature_2m_max,
    weather_temp_min_f = weather_json$daily$temperature_2m_min,
    weather_temp_mean_f = weather_json$daily$temperature_2m_mean,
    weather_precipitation_in = weather_json$daily$precipitation_sum,
    weather_rain_in = weather_json$daily$rain_sum,
    weather_wind_max_mph = weather_json$daily$windspeed_10m_max,
    weather_code = weather_json$daily$weathercode
  ) %>%
    mutate(weather_description = sapply(weather_code, get_weather_description))

  puzzle_data <- puzzle_data %>%
    left_join(weather_df, by = "complete_date")
}

Add Stock Market Data

get_stock_performance <- function(symbol, start_date, end_date) {
  tryCatch({
    extended_start <- start_date - 7
    stock_data <- getSymbols(symbol, src = "yahoo", from = extended_start,
                             to = end_date + 1, auto.assign = FALSE, warnings = FALSE)
    stock_data <- stock_data[!is.na(Cl(stock_data)), ]
    prices <- Cl(stock_data)
    returns <- dailyReturn(prices)
    data.frame(
      date = index(prices),
      close = as.numeric(prices),
      daily_return_pct = as.numeric(returns) * 100
    )
  }, error = function(e) NULL)
}

# S&P 500
sp500_data <- get_stock_performance("^GSPC", date_range_start, date_range_end)
if (!is.null(sp500_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(sp500_data %>% rename(complete_date = date, sp500_close = close,
                                     sp500_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Dow Jones
djia_data <- get_stock_performance("^DJI", date_range_start, date_range_end)
if (!is.null(djia_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(djia_data %>% rename(complete_date = date, djia_close = close,
                                    djia_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# NASDAQ
nasdaq_data <- get_stock_performance("^IXIC", date_range_start, date_range_end)
if (!is.null(nasdaq_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(nasdaq_data %>% rename(complete_date = date, nasdaq_close = close,
                                      nasdaq_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Gold
gold_data <- get_stock_performance("GC=F", date_range_start, date_range_end)
if (!is.null(gold_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(gold_data %>% rename(complete_date = date, gold_close = close,
                                    gold_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

# Oil
oil_data <- get_stock_performance("CL=F", date_range_start, date_range_end)
if (!is.null(oil_data)) {
  puzzle_data <- puzzle_data %>%
    left_join(oil_data %>% rename(complete_date = date, oil_wti_close = close,
                                   oil_daily_return_pct = daily_return_pct),
              by = "complete_date")
}

Add Holiday Proximity

get_us_holidays <- function(year) {
  # Major US holidays; nth-weekday dates are computed with lubridate's wday(),
  # where Sunday = 1 and Monday = 2
  jan1  <- as.Date(paste0(year, "-01-01"))
  feb1  <- as.Date(paste0(year, "-02-01"))
  may31 <- as.Date(paste0(year, "-05-31"))
  sep1  <- as.Date(paste0(year, "-09-01"))
  nov1  <- as.Date(paste0(year, "-11-01"))

  c(
    jan1,                                  # New Year's Day
    jan1 + 14 + (9 - wday(jan1)) %% 7,     # MLK Day (3rd Monday of January)
    feb1 + 14 + (9 - wday(feb1)) %% 7,     # Presidents Day (3rd Monday of February)
    may31 - ((wday(may31) - 2) %% 7),      # Memorial Day (last Monday of May)
    as.Date(paste0(year, "-07-04")),       # Independence Day
    sep1 + (9 - wday(sep1)) %% 7,          # Labor Day (1st Monday of September)
    nov1 + 21 + (5 - wday(nov1)) %% 7,     # Thanksgiving (4th Thursday of November)
    as.Date(paste0(year, "-12-25"))        # Christmas Day
  )
}
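
A quick spot-check of the date arithmetic (the dates in the comments are the known 2024 US federal holiday dates):

# Spot-check the generator against known 2024 holidays
get_us_holidays(2024)
# Expected: 2024-01-01 (New Year's), 2024-01-15 (MLK Day), 2024-02-19 (Presidents Day),
#           2024-05-27 (Memorial Day), 2024-07-04, 2024-09-02 (Labor Day),
#           2024-11-28 (Thanksgiving), 2024-12-25 (Christmas)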

years <- unique(year(puzzle_data$complete_date))
all_holidays <- as.Date(unlist(lapply(years, get_us_holidays)), origin = "1970-01-01")

puzzle_data <- puzzle_data %>%
  mutate(
    days_to_nearest_holiday = sapply(complete_date, function(d) min(abs(as.numeric(d - all_holidays))))
  )

Add Derived Features

puzzle_data <- puzzle_data %>%
  mutate(
    day_of_week = wday(complete_date, label = TRUE, abbr = FALSE),
    days_delay = as.numeric(complete_date - paper_date),
    month = month(complete_date, label = TRUE, abbr = FALSE),
    year = year(complete_date),
    week_of_year = week(complete_date),
    completed_same_day = days_delay == 0,
    date_numeric = as.numeric(complete_date - min(complete_date)),
    holiday_proximity = case_when(
      days_to_nearest_holiday <= 3 ~ "Within 3 days",
      days_to_nearest_holiday <= 7 ~ "4-7 days",
      days_to_nearest_holiday <= 14 ~ "8-14 days",
      TRUE ~ "More than 14 days"
    ),
    holiday_proximity = factor(holiday_proximity,
                               levels = c("Within 3 days", "4-7 days", "8-14 days", "More than 14 days")),
    temp_category = case_when(
      weather_temp_mean_f < 50 ~ "Cold (<50F)",
      weather_temp_mean_f < 70 ~ "Mild (50-70F)",
      weather_temp_mean_f < 85 ~ "Warm (70-85F)",
      weather_temp_mean_f >= 85 ~ "Hot (85F+)"
    ),
    market_direction = case_when(
      is.na(sp500_daily_return_pct) ~ NA_character_,
      sp500_daily_return_pct < -1 ~ "Down >1%",
      sp500_daily_return_pct < 0 ~ "Slightly Down",
      sp500_daily_return_pct < 1 ~ "Slightly Up",
      TRUE ~ "Up >1%"
    )
  )

Add Sentiment Analysis on Notes

# Function to analyze sentiment of a text
analyze_sentiment <- function(text) {
  if (is.na(text) || text == "") {
    return(list(
      syuzhet_score = NA_real_,
      word_count = 0,
      sentiment_label = NA_character_
    ))
  }

  # Get sentiment score using syuzhet
  syuzhet_score <- get_sentiment(text, method = "syuzhet")

  # Get word count
  words_df <- tibble(text = text) %>%
    unnest_tokens(word, text)
  word_count <- nrow(words_df)

  # Create sentiment label
  if (syuzhet_score > 0.5) {
    sentiment_label <- "Positive"
  } else if (syuzhet_score < -0.5) {
    sentiment_label <- "Negative"
  } else {
    sentiment_label <- "Neutral"
  }

  return(list(
    syuzhet_score = syuzhet_score,
    word_count = word_count,
    sentiment_label = sentiment_label
  ))
}

# Apply sentiment analysis to each note
sentiment_results <- lapply(puzzle_data$notes, analyze_sentiment)

# Extract results into columns
puzzle_data <- puzzle_data %>%
  mutate(
    note_sentiment_score = sapply(sentiment_results, function(x) x$syuzhet_score),
    note_word_count = sapply(sentiment_results, function(x) x$word_count),
    note_sentiment_label = sapply(sentiment_results, function(x) x$sentiment_label),
    has_note = !is.na(notes) & notes != ""
  )

# Get NRC emotion scores
get_nrc_emotions <- function(text) {
  if (is.na(text) || text == "") {
    return(data.frame(
      anger = NA, anticipation = NA, disgust = NA, fear = NA,
      joy = NA, sadness = NA, surprise = NA, trust = NA
    ))
  }
  emotions <- get_nrc_sentiment(text)
  return(emotions[, c("anger", "anticipation", "disgust", "fear",
                      "joy", "sadness", "surprise", "trust")])
}

nrc_emotions <- bind_rows(lapply(puzzle_data$notes, get_nrc_emotions))

puzzle_data <- puzzle_data %>%
  mutate(
    note_anger = nrc_emotions$anger,
    note_anticipation = nrc_emotions$anticipation,
    note_disgust = nrc_emotions$disgust,
    note_fear = nrc_emotions$fear,
    note_joy = nrc_emotions$joy,
    note_sadness = nrc_emotions$sadness,
    note_surprise = nrc_emotions$surprise,
    note_trust = nrc_emotions$trust
  )

# Summary
sentiment_summary <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  count(note_sentiment_label)
cat("Sentiment distribution:\n")
## Sentiment distribution:
print(sentiment_summary)
## # A tibble: 3 × 2
##   note_sentiment_label     n
##   <chr>                <int>
## 1 Negative                29
## 2 Neutral                 94
## 3 Positive                24

Final Dataset Summary

cat("Final dataset:", nrow(puzzle_data), "records,", ncol(puzzle_data), "columns\n")
## Final dataset: 157 records, 53 columns

Descriptive Statistics

Overall Time Distribution

time_stats <- puzzle_data %>%
  summarise(
    N = n(),
    Mean = round(mean(time_minutes), 2),
    Median = round(median(time_minutes), 2),
    SD = round(sd(time_minutes), 2),
    Min = round(min(time_minutes), 2),
    Max = round(max(time_minutes), 2),
    Q1 = round(quantile(time_minutes, 0.25), 2),
    Q3 = round(quantile(time_minutes, 0.75), 2)
  )

kable(time_stats, caption = "Completion Time Statistics (minutes)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time Statistics (minutes)
N Mean Median SD Min Max Q1 Q3
157 42.75 41.92 10.47 23.45 84.98 34.9 48.37
ggplot(puzzle_data, aes(x = time_minutes)) +
  geom_histogram(bins = 25, fill = "steelblue", color = "white", alpha = 0.8) +
  geom_vline(aes(xintercept = mean(time_minutes)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = median(time_minutes)), color = "orange", linetype = "dashed", size = 1) +
  labs(
    title = "Distribution of Puzzle Completion Times",
    subtitle = "Red = Mean, Orange = Median",
    x = "Time (minutes)",
    y = "Count"
  ) +
  theme(plot.title = element_text(face = "bold", size = 14))

By Paper Source (B vs J)

The “Whose Paper” column indicates whose physical newspaper was used for that puzzle session. Branden and James work together as a team on all puzzles; this field simply tracks who brought the paper that day. In a few cases, using James’s paper resulted in accidentally breaking the Sudoku in an unrecoverable way.

paper_stats <- puzzle_data %>%
  group_by(whose_paper) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    `Min (min)` = round(min(time_minutes), 2),
    `Max (min)` = round(max(time_minutes), 2),
    .groups = "drop"
  )

kable(paper_stats, caption = "Team Completion Time by Paper Source (B = Branden's paper, J = James's paper)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Team Completion Time by Paper Source (B = Branden’s paper, J = James’s paper)
whose_paper N Mean (min) Median (min) SD (min) Min (min) Max (min)
B 141 42.89 41.97 10.41 23.45 84.98
J 16 41.46 39.65 11.20 24.08 68.27
ggplot(puzzle_data, aes(x = whose_paper, y = time_minutes, fill = whose_paper)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Team Completion Time by Paper Source",
    subtitle = "B = Branden's paper, J = James's paper",
    x = "Paper Source",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")
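
The summary section cites p = 0.61 for this comparison. The exact test behind that value is not shown in the report; a Welch two-sample t-test is one minimal sketch of the same check:

# Does paper source (B vs J) make a difference? Welch two-sample t-test
# (one reasonable check; the report does not show which test produced the cited p-value)
t.test(time_minutes ~ whose_paper, data = puzzle_data)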


Time Trend Analysis

Are We Getting Better?

trend_model <- lm(time_minutes ~ date_numeric, data = puzzle_data)
trend_summary <- summary(trend_model)

ggplot(puzzle_data, aes(x = complete_date, y = time_minutes, color = whose_paper)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", se = TRUE, aes(group = 1), color = "black", linetype = "dashed") +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Team Completion Time Trend Over Time",
    subtitle = paste0("Overall trend: ", round(coef(trend_model)[2] * 30, 2),
                      " minutes/month (p = ", format(trend_summary$coefficients[2, 4], digits = 3), ")"),
    x = "Date",
    y = "Time (minutes)",
    color = "Paper Source"
  ) +
  theme(plot.title = element_text(face = "bold", size = 14))

trend_by_paper <- puzzle_data %>%
  group_by(whose_paper) %>%
  do({
    model <- lm(time_minutes ~ date_numeric, data = .)
    data.frame(
      `Monthly Change (min)` = round(coef(model)[2] * 30, 3),
      `p-value` = round(summary(model)$coefficients[2, 4], 4),
      Direction = ifelse(coef(model)[2] < 0, "Improving", "Getting Slower")
    )
  }) %>%
  ungroup()

kable(trend_by_paper, caption = "Time Trend by Paper Source") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Time Trend by Paper Source
whose_paper Monthly.Change..min. p.value Direction
B -0.292 0.0003 Improving
J -0.398 0.1267 Improving

Key Finding: The team is improving over time regardless of whose paper is used, with times decreasing by approximately 0.3-0.4 minutes per month.
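
To put an interval around that headline rate, the per-day slope from trend_model can be converted to a monthly change along with its 95% confidence interval (a small sketch reusing the fit above; the 30-day month is an approximation):

# 95% confidence interval for the monthly rate of change, from the overall trend model
slope_ci <- confint(trend_model)["date_numeric", ] * 30  # per-day slope scaled to ~30-day months
round(slope_ci, 3)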


Correlation Analysis

Numeric Predictors

numeric_cols <- puzzle_data %>%
  select(
    time_minutes,
    moon_illumination,
    weather_temp_mean_f, weather_precipitation_in, weather_wind_max_mph,
    sp500_daily_return_pct, djia_daily_return_pct, nasdaq_daily_return_pct,
    gold_daily_return_pct, oil_daily_return_pct,
    days_to_nearest_holiday, days_delay
  ) %>%
  drop_na()

correlations <- cor(numeric_cols)[, "time_minutes"]
correlations <- correlations[names(correlations) != "time_minutes"]
correlations <- sort(correlations, decreasing = TRUE)

cor_df <- data.frame(
  Variable = names(correlations),
  Correlation = round(correlations, 4),
  `Abs Correlation` = abs(round(correlations, 4))
) %>%
  arrange(desc(`Abs.Correlation`))

kable(cor_df, caption = "Correlations with Completion Time") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(abs(cor_df$Correlation) > 0.1), bold = TRUE, background = "#ffffcc")
Correlations with Completion Time
Variable Correlation Abs.Correlation
weather_wind_max_mph 0.1629 0.1629
weather_precipitation_in 0.0975 0.0975
gold_daily_return_pct -0.0698 0.0698
moon_illumination -0.0698 0.0698
djia_daily_return_pct -0.0551 0.0551
days_delay 0.0502 0.0502
sp500_daily_return_pct -0.0460 0.0460
nasdaq_daily_return_pct -0.0244 0.0244
oil_daily_return_pct 0.0122 0.0122
days_to_nearest_holiday -0.0112 0.0112
weather_temp_mean_f -0.0099 0.0099
cor_matrix <- cor(numeric_cols, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper",
         tl.cex = 0.8, tl.col = "black",
         addCoef.col = "black", number.cex = 0.7,
         col = colorRampPalette(c("#6D9EC1", "white", "#E46726"))(200))

Key Finding: Wind speed shows the strongest correlation with completion time (r = 0.16), though even this relationship is weak. Windier days tend to go with slower puzzle times.
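
Because r = 0.16 is modest, it is worth checking whether it is distinguishable from zero; a quick sketch with cor.test (rows with missing weather data are dropped automatically):

# Significance check for the wind/time correlation
cor.test(puzzle_data$weather_wind_max_mph, puzzle_data$time_minutes)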


Categorical Variable Analysis

Day of Week

day_stats <- puzzle_data %>%
  group_by(day_of_week) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(day_stats, caption = "Completion Time by Day of Week") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Day of Week
day_of_week N Mean (min) Median (min) SD (min)
Saturday 3 35.71 32.05 7.50
Wednesday 125 41.49 40.67 9.67
Friday 8 42.39 43.21 7.62
Tuesday 6 51.18 50.25 9.05
Thursday 10 51.38 46.52 15.35
Monday 5 51.65 50.10 11.48
day_order <- puzzle_data %>%
  group_by(day_of_week) %>%
  summarise(mean_time = mean(time_minutes)) %>%
  arrange(mean_time) %>%
  pull(day_of_week)

ggplot(puzzle_data, aes(x = factor(day_of_week, levels = day_order), y = time_minutes, fill = day_of_week)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by Day of Week",
    subtitle = "ANOVA p = 0.003 (Significant!)",
    x = "Day",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")

Key Finding: Day of the week significantly affects completion time (p = 0.003). Completing on the paper day (Wednesday) is faster than delaying; the Saturday mean is lower still, but it rests on only three sessions.
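
The p-value in the figure subtitle is hard-coded; the underlying one-way ANOVA can be reproduced directly (a sketch):

# One-way ANOVA: does mean completion time differ by day of week?
summary(aov(time_minutes ~ day_of_week, data = puzzle_data))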

Same Day Completion

same_day_stats <- puzzle_data %>%
  group_by(completed_same_day) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  mutate(completed_same_day = ifelse(completed_same_day, "Same Day", "Delayed"))

kable(same_day_stats, caption = "Same Day vs Delayed Completion") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Same Day vs Delayed Completion
completed_same_day N Mean (min) Median (min)
Delayed 36 46.72 44.33
Same Day 121 41.57 40.67
ggplot(puzzle_data, aes(x = completed_same_day, y = time_minutes, fill = completed_same_day)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.2) +
  scale_fill_manual(values = c("TRUE" = "#2ecc71", "FALSE" = "#e74c3c")) +
  labs(
    title = "Same Day vs Delayed Completion",
    subtitle = "ANOVA p = 0.009 (Significant!)",
    x = "Completed Same Day",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")

Key Finding: Completing puzzles on the same day as the paper is significantly faster (41.6 min vs 46.7 min, p = 0.009).
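
Since this comparison has only two groups, a Welch two-sample t-test is a natural companion to the ANOVA quoted in the figure (a sketch; it does not assume equal variances):

# Same-day vs delayed completion: Welch two-sample t-test
t.test(time_minutes ~ completed_same_day, data = puzzle_data)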

Zodiac Sign

zodiac_stats <- puzzle_data %>%
  group_by(zodiac_sign) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(zodiac_stats, caption = "Completion Time by Zodiac Sign") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Zodiac Sign
zodiac_sign N Mean (min) Median (min)
Leo 14 40.32 38.14
Sagittarius 13 40.33 41.63
Capricorn 13 40.85 34.67
Libra 13 41.39 41.92
Taurus 11 42.02 44.00
Gemini 13 42.15 40.05
Aries 13 42.44 40.07
Pisces 14 43.01 44.12
Cancer 14 43.24 42.56
Scorpio 13 44.01 42.08
Virgo 13 44.40 37.82
Aquarius 13 48.85 46.77
ggplot(puzzle_data, aes(x = reorder(zodiac_sign, time_minutes), y = time_minutes)) +
  geom_boxplot(fill = "#9b59b6", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Completion Time by Zodiac Sign",
    subtitle = "ANOVA p = 0.80 (Not Significant)",
    x = "Zodiac Sign",
    y = "Time (minutes)"
  )

Moon Phase

moon_stats <- puzzle_data %>%
  group_by(moon_phase_name) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(moon_stats, caption = "Completion Time by Moon Phase") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Moon Phase
moon_phase_name N Mean (min) Median (min)
New Moon 8 40.57 38.72
Waxing Gibbous 37 41.24 41.52
Last Quarter 6 41.40 38.40
Waning Gibbous 34 41.82 40.24
Full Moon 3 42.61 45.25
Waning Crescent 28 44.20 44.52
Waxing Crescent 40 44.40 43.13
First Quarter 1 49.37 49.37
ggplot(puzzle_data, aes(x = reorder(moon_phase_name, time_minutes), y = time_minutes)) +
  geom_boxplot(fill = "#34495e", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Completion Time by Moon Phase",
    subtitle = "ANOVA p = 0.85 (Not Significant)",
    x = "Moon Phase",
    y = "Time (minutes)"
  )


Weather Impact

Temperature

temp_stats <- puzzle_data %>%
  filter(!is.na(temp_category)) %>%
  group_by(temp_category) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(temp_stats, caption = "Completion Time by Temperature") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Temperature
temp_category N Mean (min) Median (min)
Warm (70-85F) 55 41.08 38.38
Mild (50-70F) 52 42.73 43.46
Cold (<50F) 23 43.70 40.25
Hot (85F+) 27 45.37 43.82
ggplot(puzzle_data %>% filter(!is.na(weather_temp_mean_f)),
       aes(x = weather_temp_mean_f, y = time_minutes)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Completion Time vs Temperature",
    x = "Mean Temperature (F)",
    y = "Time (minutes)"
  )

Wind Speed

ggplot(puzzle_data %>% filter(!is.na(weather_wind_max_mph)),
       aes(x = weather_wind_max_mph, y = time_minutes)) +
  geom_point(alpha = 0.5, color = "#27ae60") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Completion Time vs Wind Speed",
    subtitle = paste("Correlation r =", round(cor(puzzle_data$time_minutes,
                                                   puzzle_data$weather_wind_max_mph,
                                                   use = "complete.obs"), 3)),
    x = "Max Wind Speed (mph)",
    y = "Time (minutes)"
  )

Key Finding: Higher wind speeds are associated with slower completion times (r = 0.16).


Holiday Proximity

holiday_stats <- puzzle_data %>%
  group_by(holiday_proximity) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  )

kable(holiday_stats, caption = "Completion Time by Holiday Proximity") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Holiday Proximity
holiday_proximity N Mean (min) Median (min)
Within 3 days 21 42.64 42.98
4-7 days 22 42.67 40.88
8-14 days 42 44.08 43.64
More than 14 days 72 42.03 40.59
ggplot(puzzle_data, aes(x = holiday_proximity, y = time_minutes, fill = holiday_proximity)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by Holiday Proximity",
    subtitle = "ANOVA p = 0.80 (Not Significant)",
    x = "Days to Nearest Holiday",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Blues")

Finding: Holiday proximity does not significantly affect completion time.


Stock Market Impact

market_stats <- puzzle_data %>%
  filter(!is.na(market_direction)) %>%
  group_by(market_direction) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(market_stats, caption = "Completion Time by S&P 500 Direction") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by S&P 500 Direction
market_direction N Mean (min) Median (min)
Slightly Up 69 41.40 41.33
Down >1% 15 42.64 42.08
Up >1% 20 44.36 43.97
Slightly Down 49 44.55 42.48
ggplot(puzzle_data %>% filter(!is.na(market_direction)),
       aes(x = market_direction, y = time_minutes, fill = market_direction)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Completion Time by S&P 500 Performance",
    subtitle = "ANOVA p = 0.39 (Not Significant)",
    x = "Market Direction",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "RdYlGn")

Finding: Stock market performance does not significantly affect completion time.


Sentiment Analysis of Notes

Sentiment Distribution

sentiment_dist <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  count(note_sentiment_label) %>%
  mutate(Percentage = round(n / sum(n) * 100, 1))

kable(sentiment_dist, col.names = c("Sentiment", "Count", "Percentage"),
      caption = "Distribution of Note Sentiments") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Distribution of Note Sentiments
Sentiment Count Percentage
Negative 29 19.7
Neutral 94 63.9
Positive 24 16.3
ggplot(sentiment_dist, aes(x = "", y = n, fill = note_sentiment_label)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  scale_fill_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(title = "Sentiment Distribution in Notes", fill = "Sentiment") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

Sentiment vs Completion Time

sentiment_time <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  group_by(note_sentiment_label) %>%
  summarise(
    N = n(),
    `Mean (min)` = round(mean(time_minutes), 2),
    `Median (min)` = round(median(time_minutes), 2),
    `SD (min)` = round(sd(time_minutes), 2),
    .groups = "drop"
  ) %>%
  arrange(`Mean (min)`)

kable(sentiment_time, caption = "Completion Time by Note Sentiment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Completion Time by Note Sentiment
note_sentiment_label N Mean (min) Median (min) SD (min)
Positive 24 40.72 40.39 10.77
Neutral 94 41.88 41.42 10.61
Negative 29 46.81 47.12 10.11
ggplot(puzzle_data %>% filter(!is.na(note_sentiment_label)),
       aes(x = note_sentiment_label, y = time_minutes, fill = note_sentiment_label)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_fill_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(
    title = "Completion Time by Note Sentiment",
    subtitle = paste("ANOVA p =", round(summary(aov(time_minutes ~ note_sentiment_label,
                                                     data = puzzle_data %>%
                                                       filter(!is.na(note_sentiment_label))))[[1]][["Pr(>F)"]][1], 4)),
    x = "Sentiment",
    y = "Time (minutes)"
  ) +
  theme(legend.position = "none")

Sentiment Score vs Time

sentiment_cor <- cor(puzzle_data$time_minutes, puzzle_data$note_sentiment_score, use = "complete.obs")

ggplot(puzzle_data %>% filter(!is.na(note_sentiment_score)),
       aes(x = note_sentiment_score, y = time_minutes)) +
  geom_point(alpha = 0.5, aes(color = note_sentiment_label)) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  scale_color_manual(values = c("Negative" = "#e74c3c", "Neutral" = "#95a5a6", "Positive" = "#2ecc71")) +
  labs(
    title = "Completion Time vs Sentiment Score",
    subtitle = paste("Correlation r =", round(sentiment_cor, 3)),
    x = "Sentiment Score (negative = frustrated, positive = happy)",
    y = "Time (minutes)",
    color = "Sentiment"
  )

NRC Emotion Analysis

The NRC lexicon categorizes words into 8 emotions: anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
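
For context, get_nrc_sentiment() returns one row of word counts per input string, with one column per emotion plus overall negative/positive counts. A small illustration on a made-up note (the text below is hypothetical, not taken from the dataset):

# Illustration only: a hypothetical note, not from the actual data
example_note <- "Terrible sudoku today, but we rallied and finished strong"
get_nrc_sentiment(example_note)
# Returns a one-row data frame with counts for anger, anticipation, disgust, fear,
# joy, sadness, surprise, trust, plus overall negative and positive word counts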

emotion_cols <- c("note_anger", "note_anticipation", "note_disgust", "note_fear",
                  "note_joy", "note_sadness", "note_surprise", "note_trust")

emotion_cors <- sapply(emotion_cols, function(col) {
  cor(puzzle_data$time_minutes, puzzle_data[[col]], use = "complete.obs")
})

emotion_cor_df <- data.frame(
  Emotion = gsub("note_", "", names(emotion_cors)),
  Correlation = round(emotion_cors, 4)
) %>%
  arrange(desc(abs(Correlation)))

kable(emotion_cor_df, caption = "Emotion Correlations with Completion Time") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(abs(emotion_cor_df$Correlation) > 0.05), bold = TRUE, background = "#ffffcc")
Emotion Correlations with Completion Time
Emotion Correlation
anger 0.2270
anticipation -0.1210
fear 0.0831
sadness 0.0722
joy -0.0412
disgust -0.0393
trust 0.0226
surprise 0.0123
emotion_cor_df$Emotion <- factor(emotion_cor_df$Emotion, levels = emotion_cor_df$Emotion)

ggplot(emotion_cor_df, aes(x = reorder(Emotion, Correlation), y = Correlation,
                            fill = ifelse(Correlation > 0, "Positive", "Negative"))) +
  geom_bar(stat = "identity", alpha = 0.8) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#e74c3c", "Negative" = "#2ecc71")) +
  labs(
    title = "Emotion Correlations with Completion Time",
    subtitle = "Positive correlation = emotion associated with slower times",
    x = "Emotion",
    y = "Correlation with Time"
  ) +
  theme(legend.position = "none") +
  geom_hline(yintercept = 0, linetype = "dashed")

Sample Notes by Sentiment

Note: For the public version of this report, I have removed my notes because at times they can get quite colorful.

Word Cloud of Notes

# Get all words from notes
all_words <- puzzle_data %>%
  filter(!is.na(notes)) %>%
  select(notes) %>%
  unnest_tokens(word, notes) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  filter(n > 1)

# Create word cloud
if (nrow(all_words) > 0) {
  wordcloud(words = all_words$word, freq = all_words$n,
            min.freq = 2, max.words = 100,
            random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}


Regression Analysis

Multiple Regression Model

# Note: whose_paper represents paper source (whose newspaper was used), not who completed the puzzles
# The team works together on all puzzles - this variable tests if paper source affects completion time
regression_data <- puzzle_data %>%
  filter(!is.na(weather_temp_mean_f) & !is.na(sp500_daily_return_pct)) %>%
  mutate(
    paper_source = factor(whose_paper),  # Renamed for clarity
    day_of_week = factor(day_of_week)
  )

full_model <- lm(
  time_minutes ~
    paper_source +
    days_delay +
    weather_temp_mean_f +
    weather_precipitation_in +
    weather_wind_max_mph +
    moon_illumination +
    sp500_daily_return_pct +
    days_to_nearest_holiday +
    date_numeric,
  data = regression_data
)

model_summary <- summary(full_model)

cat("R-squared:", round(model_summary$r.squared, 4), "\n")
## R-squared: 0.1127
cat("Adjusted R-squared:", round(model_summary$adj.r.squared, 4), "\n")
## Adjusted R-squared: 0.0568
cat("F-statistic p-value:", format(pf(model_summary$fstatistic[1],
                                       model_summary$fstatistic[2],
                                       model_summary$fstatistic[3],
                                       lower.tail = FALSE), digits = 4), "\n")
## F-statistic p-value: 0.04138
coef_table <- tidy(full_model) %>%
  mutate(
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = round(p.value, 4),
    Significant = ifelse(p.value < 0.05, "**", ifelse(p.value < 0.1, "*", ""))
  ) %>%
  arrange(p.value)

kable(coef_table, caption = "Multiple Regression Coefficients") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(coef_table$p.value < 0.05), bold = TRUE, background = "#d4edda") %>%
  row_spec(which(coef_table$p.value >= 0.05 & coef_table$p.value < 0.1), background = "#fff3cd")
Multiple Regression Coefficients
term estimate std.error statistic p.value Significant
(Intercept) 47.3919 5.4921 8.629 0.0000 **
date_numeric -0.0092 0.0027 -3.349 0.0010 **
weather_precipitation_in 1.9363 2.0644 0.938 0.3499
moon_illumination -2.0632 2.4140 -0.855 0.3942
paper_sourceJ -2.0730 2.7446 -0.755 0.4513
weather_wind_max_mph 0.1402 0.2115 0.663 0.5084
sp500_daily_return_pct -0.2932 0.7567 -0.387 0.6990
days_delay 0.0013 0.0041 0.330 0.7419
days_to_nearest_holiday -0.0195 0.0706 -0.275 0.7834
weather_temp_mean_f 0.0014 0.0549 0.025 0.9800

Random Forest Feature Importance

# Note: paper_source represents whose newspaper was used, not who completed the puzzles
rf_data <- puzzle_data %>%
  select(
    time_minutes, whose_paper, days_delay,
    weather_temp_mean_f, weather_precipitation_in, weather_wind_max_mph,
    moon_illumination, sp500_daily_return_pct, djia_daily_return_pct,
    nasdaq_daily_return_pct, days_to_nearest_holiday, date_numeric,
    gold_daily_return_pct, oil_daily_return_pct
  ) %>%
  rename(paper_source = whose_paper) %>%  # Renamed for clarity
  drop_na() %>%
  mutate(paper_source = factor(paper_source))

set.seed(42)
rf_model <- randomForest(time_minutes ~ ., data = rf_data, ntree = 500, importance = TRUE)

importance_df <- importance(rf_model) %>%
  as.data.frame() %>%
  rownames_to_column("Variable") %>%
  arrange(desc(`%IncMSE`)) %>%
  mutate(
    `%IncMSE` = round(`%IncMSE`, 2),
    IncNodePurity = round(IncNodePurity, 2)
  )

kable(importance_df, caption = "Random Forest Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Random Forest Feature Importance
Variable %IncMSE IncNodePurity
date_numeric 1.44 2359.46
sp500_daily_return_pct 0.60 1137.80
paper_source 0.16 132.00
weather_wind_max_mph 0.05 1526.89
days_delay -0.38 789.55
moon_illumination -0.53 1520.78
djia_daily_return_pct -0.82 1001.26
days_to_nearest_holiday -2.66 988.86
weather_precipitation_in -3.06 777.06
oil_daily_return_pct -3.22 1070.87
nasdaq_daily_return_pct -3.81 1133.81
weather_temp_mean_f -4.39 1575.64
gold_daily_return_pct -4.65 1191.18
varImpPlot(rf_model, main = "Random Forest Feature Importance")

Key Finding: The most important predictor is date_numeric (the time trend), consistent with completion times improving over time. The %IncMSE values are small overall (and mostly negative), so none of the external factors add much predictive power.
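
To see how the top predictor drives the fitted response, randomForest’s partialPlot() gives a quick partial-dependence view (a sketch reusing rf_model and rf_data from above; the axis labels are just suggestions):

# Partial dependence of predicted completion time on the time trend variable
partialPlot(rf_model, pred.data = as.data.frame(rf_data), x.var = "date_numeric",
            main = "Partial Dependence: date_numeric",
            xlab = "Days since first session")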


Record Times

fastest <- puzzle_data %>% slice_min(time_minutes, n = 5)
slowest <- puzzle_data %>% slice_max(time_minutes, n = 5)

kable(
  fastest %>%
    select(complete_date, whose_paper, time_minutes, zodiac_sign, weather_description) %>%
    rename(`Paper Source` = whose_paper),
  caption = "Top 5 Fastest Team Times"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Top 5 Fastest Team Times
complete_date Paper Source time_minutes zodiac_sign weather_description
2024-06-05 B 23.45000 Gemini Slight rain
2025-02-05 J 24.08333 Aquarius Overcast
2025-09-10 B 24.40000 Virgo Overcast
2024-08-21 B 24.85000 Leo Overcast
2024-06-12 B 26.76667 Gemini Moderate drizzle
kable(
  slowest %>%
    select(complete_date, whose_paper, time_minutes, zodiac_sign, weather_description, notes) %>%
    rename(`Paper Source` = whose_paper),
  caption = "Top 5 Slowest Team Times"
) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Top 5 Slowest Team Times
complete_date Paper Source time_minutes zodiac_sign weather_description notes
2023-01-26 B 84.98333 Aquarius Mainly clear Bert showed up and distracted everyone.
2023-09-20 B 68.86667 Virgo Light drizzle Bert wasn’t even here. 
2023-05-24 J 68.26667 Gemini Slight rain Fuckin Shortz
2025-01-02 B 67.66667 Capricorn Clear sky Yeah, not a great start. Sudoku and NYT killed us.
2025-07-14 B 64.33333 Cancer Heavy rain 5am flight from MIA. G left the paper on the lawn.

Summary & Conclusions

Statistically Significant Findings

Factor Effect Evidence
Time Trend Improving ~0.3 min/month p < 0.001
Day of Week Wednesday is fastest p = 0.003
Same Day Completion ~5 minutes faster p = 0.009
Wind Speed Higher wind = slower r = 0.16

Sentiment Analysis Findings

# Calculate sentiment stats for summary
sentiment_summary_stats <- puzzle_data %>%
  filter(!is.na(note_sentiment_label)) %>%
  group_by(note_sentiment_label) %>%
  summarise(mean_time = mean(time_minutes), .groups = "drop")

sentiment_aov_p <- summary(aov(time_minutes ~ note_sentiment_label,
                               data = puzzle_data %>%
                                 filter(!is.na(note_sentiment_label))))[[1]][["Pr(>F)"]][1]

  • Note sentiment shows a marginal trend in completion time (p = 0.057): sessions with negative notes averaged about 47 minutes versus about 41 for positive notes
  • Negative notes (frustration) tend to accompany harder puzzles or mistakes
  • Among the NRC emotions, anger has the strongest correlation with slower times (r = 0.23), while anticipation leans slightly the other way (r = -0.12)

Not Significant

  • Holiday proximity (p = 0.80)
  • Zodiac sign (p = 0.80)
  • Moon phase (p = 0.85)
  • Temperature (p = 0.35)
  • Stock market (p = 0.39)
  • Paper source (B vs J) (p = 0.61) - no difference based on whose newspaper was used

Recommendations

  1. Complete puzzles on Wednesday - Don’t delay!
  2. Avoid windy days if you want a faster time
  3. Keep practicing - The team is getting better!
  4. Either paper works - No advantage to using Branden’s vs James’s newspaper

Appendix: Full Dataset

Note: “whose_paper” indicates whose physical newspaper was used (B = Branden’s paper, J = James’s paper). All puzzles were completed as a team effort by Branden and James working together.

DT::datatable(
  puzzle_data %>%
    # notes column omitted here; see the earlier note about removing colorful notes from the public version
    select(paper_date, complete_date, time_minutes, whose_paper,
           zodiac_sign, moon_phase_name, weather_description,
           sp500_daily_return_pct) %>%
    mutate(time_minutes = round(time_minutes, 2),
           sp500_daily_return_pct = round(sp500_daily_return_pct, 2)) %>%
    rename(`Paper Source` = whose_paper),
  options = list(pageLength = 10, scrollX = TRUE),
  caption = "Full Puzzle Wednesday Dataset (Team Completion Times)"
)

Report generated on 2026-01-24 10:45:54.601529