pacman::p_load(dplyr, stringr, kableExtra)
load_data = function (f) read.csv(file.path(here::here("data"), f))
orders = load_data("noahs-orders.csv")
orders_items = load_data("noahs-orders_items.csv")
products = load_data("noahs-products.csv")
customers = load_data("noahs-customers.csv")Solving the Hanukkah of Data Puzzles
R
Python
Coding Puzzles
Code solutions for the Hanukkah of Data challenge.
The Hanukkah of Data provides eight short daily coding puzzles during Hanukkah. I found this set of puzzles to be a great for improving skills in data wrangling, I initially wrote my solutions in R but these puzzles served as a good practice set for getting some reps in with data cleaning in Python.
The puzzle solutions can be a little repetitive, however the solutions are not immediately apparent from the prompts, and I found myself looking forward to solving the riddles each morning. With each puzzle you unlock more of the artwork in Noah’s Rug.
Spoiler warning – Below are my solutions to the puzzles.

Lets read in the data, load our packages, and do some pre-processing.
import pandas as pd
orders = pd.read_csv("data/noahs-orders.csv")
orders_items = pd.read_csv("data/noahs-orders_items.csv")
products = pd.read_csv("data/noahs-products.csv")
customers = pd.read_csv("data/noahs-customers.csv")
data = (
customers
.merge(orders, on = "customerid")
.merge(orders_items, on = "orderid")
.merge(products, on = "sku")
)customers |>
mutate(name = gsub(" jr.| iii| v| ii| iv | i", "", tolower(name)),
last_name = str_extract(name, "\\w+$"),
name_number = str_replace_all(last_name, "a|b|c", "2") |>
str_replace_all("d|e|f", "3") |>
str_replace_all("g|h|i", "4") |>
str_replace_all("j|k|l", "5") |>
str_replace_all("m|n|o", "6") |>
str_replace_all("p|q|r|s", "7") |>
str_replace_all("t|u|v", "8") |>
str_replace_all("w|x|y|z", "9"),
phone_number = gsub("-", "", phone)) |>
filter(name_number == phone_number) |>
select(name, phone) name phone
1 sam tannenbaum 826-636-2286
letter_to_phone = {
"a": "2", "b": "2", "c": "2",
"d": "3", "e": "3", "f": "3",
"g": "4", "h": "4", "i": "4",
"j": "5", "k": "5", "l": "5",
"m": "6", "n": "6", "o": "6",
"p": "7", "q": "7", "r": "7", "s": "7",
"t": "8", "u": "8", "v": "8",
"w": "9", "x": "9", "y": "9", "z": "9"
}
for index, row in customers.iterrows():
name = row['name']
phone = row['phone']
lname = (
name
.lower()
.replace("jr.| iii| v| ii| iv | i", "")
.split(" ")[-1]
)
for word, replacement in letter_to_phone.items():
lname = lname.replace(word, replacement)
if str(lname) == str(phone).replace("-", ""):
print(f"Solution:\nName: {name}\nPhone: {phone}")Solution:
Name: Sam Tannenbaum
Phone: 826-636-2286
bagel_skus = products |>
filter(grepl("bagel", tolower(desc))) |>
pull(sku)
bagel_order_ids = orders_items |>
filter(sku %in% bagel_skus)
bagel_order_customers = orders |>
mutate(year = lubridate::year(shipped)) |>
filter(orderid %in% bagel_order_ids$orderid,
year == 2017) |>
pull(customerid)
customers |>
filter(customerid %in% bagel_order_customers) |>
mutate(name = gsub(" jr.| iii| v| ii| iv | i", "", tolower(name)),
last_name = str_extract(name, "\\w+$"),
first_name = str_extract(name, "^\\w+"),
initals = paste0(str_sub(first_name, 1, 1),
str_sub(last_name, 1, 1))) |>
filter(initals == "jp") |>
select(name, phone) name phone
1 joshua peterson 332-274-4185
cust = data[data["desc"].str.contains("bagel", case = False) &
data["shipped"].str.contains("2017")]
for index, row in cust.iterrows():
name = row['name']
phone = row['phone']
names = (
name
.lower()
.replace("jr.| iii| v| ii| iv | i", "")
.split(" ")
)
initals = names[0][0] + names[-1][0]
if initals == "jp":
print(f"Solution:\nName: {name}\nPhone: {phone}")Solution:
Name: Joshua Peterson
Phone: 332-274-4185
rabbit_years = c(1939, 1951, 1963, 1975, 1987, 1999)
neighborhood = customers |>
filter(phone == "332-274-4185") |>
pull("citystatezip")
customers |>
mutate(year = lubridate::year(birthdate),
month = lubridate::month(birthdate),
day = lubridate::day(birthdate)) |>
filter(year %in% rabbit_years,
month == 6 & day >= 21 | month == 7 & day <= 22,
citystatezip == neighborhood) |>
select(name, phone) name phone
1 Robert Morton 917-288-9635
rabbit_years = [1939, 1951, 1963, 1975, 1987, 1999]
neighborhood = customers[customers["phone"] == "332-274-4185"]["citystatezip"].iloc[0]
data["birthdate"] = pd.to_datetime(data["birthdate"])
birth_month = data["birthdate"].dt.month
birth_day = data["birthdate"].dt.day
birth_year = data["birthdate"].dt.year
june_f = (birth_month == 6) & (birth_day >= 21)
july_f = (birth_month == 7) & (birth_day <= 22)
rabbit_year_f = birth_year.isin(rabbit_years)
neighborhood_f = data["citystatezip"] == neighborhood
conditions = (june_f | july_f) & rabbit_year_f & neighborhood_f
# Apply the filter
data[conditions][["name", "phone"]] name phone
198918 Robert Morton 917-288-9635
bakery_skus = products |>
filter(grepl("bky", tolower(sku))) |>
filter(!grepl("bagel", tolower(desc)))
bakery_order_items = orders_items |>
filter(sku %in% bakery_skus$sku,
qty > 1) |>
pull("orderid")
bakery_orders_customers = orders |>
filter(orderid %in% bakery_order_items,
lubridate::hour(shipped) < 5,
ordered == shipped)
customers |>
filter(customerid %in% bakery_orders_customers$customerid) |>
select(name, phone) name phone
1 Renee Harmon 607-231-3605
data["shipped"] = pd.to_datetime(data["shipped"])
data["ordered"] = pd.to_datetime(data["ordered"])
sku_f = data["sku"].str.contains("bky", case=False)
hour_f = data["shipped"].dt.hour < 5
in_store_f = data["shipped"] == data["ordered"]
qty_f = data["qty"] > 1
conditions = sku_f & hour_f & in_store_f & qty_f
data[conditions][["name", "phone"]].drop_duplicates() name phone
18500 Renee Harmon 607-231-3605
old_cat_food = products |>
filter(grepl("senior cat", tolower(desc))) |>
pull("sku")
cat_order_items = orders_items |>
filter(sku %in% old_cat_food,
qty == 10) |>
pull("orderid")
orders |>
filter(orderid %in% cat_order_items) |>
left_join(customers, by = "customerid") |>
select(name, phone) |>
unique() name phone
1 Nicole Wilson 631-507-6048
cat_desc = data["desc"].str.contains("senior cat", case=False)
cat_qty = data["qty"] == 10
data[cat_desc & cat_qty][["name", "phone"]].drop_duplicates() name phone
6140 Nicole Wilson 631-507-6048
# check which prices are offered below wholesale cost
orders_below_cost = orders_items |>
left_join(products |> select(wholesale_cost, sku),
by = "sku") |>
mutate(sale = unit_price < wholesale_cost) |>
filter(sale) |>
pull(orderid)
orders |>
filter(orderid %in% orders_below_cost) |>
group_by(customerid) |>
summarise(num_purchases = n()) |>
arrange(desc(num_purchases)) |>
left_join(customers |> select(customerid, name, phone), by = "customerid") |>
filter(num_purchases == max(num_purchases)) |>
select(name, phone)# A tibble: 1 × 2
name phone
<chr> <chr>
1 Sherri Long 585-838-9161
below_cost = data["unit_price"] < data["wholesale_cost"]
counts = (
data[below_cost]
.groupby(["name", "phone"])
.agg(n_below_cost = pd.NamedAgg(column="orderid", aggfunc="nunique"))
)
counts[counts["n_below_cost"] == counts["n_below_cost"].max()] n_below_cost
name phone
Sherri Long 585-838-9161 31
day_6_phone = "585-838-9161"
past_customer = customers |>
filter(phone == day_6_phone) |>
pull(customerid)
color_purchases = products |>
left_join(orders_items |> select(orderid, sku), by = "sku") |>
left_join(orders |> select(orderid, customerid,
ordered, shipped),
by = "orderid") |>
filter(grepl("col", tolower(sku)),
ordered == shipped) |>
mutate(item = str_extract(desc, "\\w+\\s\\w+"),
colour = str_extract(desc, "\\(\\w+\\)"),
colour = gsub("\\(|\\)", "", colour))
gf_purchases = color_purchases |>
filter(customerid == past_customer,
grepl("\\(", desc))
search_for_purchases = function(dat) {
gf_item = dat$item
gf_colour = dat$colour
gf_time = dat$ordered
out = color_purchases |>
filter(item == gf_item & colour != gf_colour,
abs(difftime(ordered, gf_time, units = "mins")) < 10)
return(out)
}
# loop through girlfriend's purchases
lapply(1:nrow(gf_purchases), function(i) {
search_for_purchases(gf_purchases[i, ])
}) |>
bind_rows() |>
select(customerid) |>
left_join(customers, by = "customerid") |>
select(customerid, name, phone) customerid name phone
1 5783 Carlos Myers 838-335-7157
day_6_phone = "585-838-9161"
data["color"] = data["desc"].str.extract(r"\((\w+)\)")
data["item"] = data["desc"].str.extract(r"(\w+\s\w+)")
data["ordered"] = pd.to_datetime(data["ordered"])
color_data = data[data["sku"].str.contains("col", case=False)]
gf_items = color_data[color_data["phone"] == day_6_phone]
for color, item, time in zip(gf_items["color"],
gf_items["item"],
gf_items["ordered"]):
time_delta = color_data["ordered"] - time
diff_color = color_data["color"] != color
same_item = color_data["item"] == item
same_time = abs(time_delta) < pd.Timedelta("10 minutes")
candidate = same_time & same_item & diff_color
if candidate.any():
color_data[candidate][["name", "phone"]].drop_duplicates() name phone
324740 Carlos Myers 838-335-7157
# find customer with the most collectible (col) purchases
products |>
filter(grepl("col", tolower(sku))) |>
left_join(orders_items |> select(orderid, sku), by = "sku") |>
left_join(orders |> select(orderid, customerid), by = "orderid") |>
group_by(customerid) |>
tally() |>
left_join(customers |> select(customerid, name, phone), by = "customerid") |>
arrange(desc(n)) |>
filter(n == max(n)) |>
select(name, phone)# A tibble: 1 × 2
name phone
<chr> <chr>
1 James Smith 212-547-3518
collectables = data[data["sku"].str.contains("col", case=False)]
(
collectables
.groupby(["name", "phone"])
.agg(n_collectables=pd.NamedAgg(column="orderid", aggfunc="nunique"))
.sort_values("n_collectables", ascending=False)
.head(1)
) n_collectables
name phone
James Smith 212-547-3518 109