Show the code
import polars as pl
import requests #new for reading in JSON
import io #new for reading in JSON
import numpy as np
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)import polars as pl
import requests #new for reading in JSON
import io #new for reading in JSON
import numpy as np
from lets_plot import *
LetsPlot.setup_html(isolated_frame=True)# Learn morea about Code Cells: https://quarto.org/docs/reference/cells/cells-jupyter.html
# Include and execute your code here
url = "https://raw.githubusercontent.com/byuidatascience/data4missing/master/data-raw/flights_missing/flights_missing.json"
response = requests.get(url)
df = pl.read_json(io.BytesIO(response.content))No discussion needed for this question
# Include and execute your code here
df = (
df
.with_columns(
pl.when(pl.col("month").is_in(["January", "Febuary", "March"]))
.then(pl.lit("Winter"))
.when(pl.col("month").is_in(["April", "May", "June"]))
.then(pl.lit("Spring"))
.when(pl.col("month").is_in(["July", "August", "September"]))
.then(pl.lit("Summer"))
.when(pl.col("month").is_in(["October", "November", "December"]))
.then(pl.lit("Fall"))
.otherwise(pl.lit("Unknown"))
.alias("season")
)
)
df.select(["month", "season"]).head()| month | season |
|---|---|
| str | str |
| "January" | "Winter" |
| "January" | "Winter" |
| "January" | "Winter" |
| "January" | "Winter" |
| "January" | "Winter" |
No discussion needed for this question
# Include and execute your code here
num_of_columns = [col for col in df.columns if col.startswith("num_of")]
df_clean = (
df
.filter(
pl.all_horizontal([
~pl.col(col).cast(pl.Utf8).str.contains(r"\+")
for col in num_of_columns
])
)
.with_columns([
pl.col(col).cast(pl.Float64)
for col in num_of_columns
])
.filter(
pl.all_horizontal([
pl.col(col) != -999
for col in num_of_columns
])
)
.with_columns([
(pl.col(col) / 1000).alias(col)
for col in num_of_columns
])
)
df_clean.select(num_of_columns).head()| num_of_flights_total | num_of_delays_carrier | num_of_delays_late_aircraft | num_of_delays_nas | num_of_delays_security | num_of_delays_weather | num_of_delays_total |
|---|---|---|---|---|---|---|
| f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 12.687 | 1.041 | 0.928 | 0.935 | 0.011 | 0.233 | 3.153 |
| 12.381 | 0.414 | 1.058 | 0.895 | 0.004 | 0.061 | 2.43 |
| 28.194 | 1.197 | 2.255 | 5.415 | 0.005 | 0.306 | 9.178 |
| 7.283 | 0.572 | 0.68 | 0.638 | 0.007 | 0.056 | 1.952 |
| 10.274 | 0.798 | 0.733 | 1.166 | 0.007 | 0.114 | 2.816 |
According to the BTS website, the “Weather” category only accounts for severe weather delays. Mild weather delays are not counted in the “Weather” category, but are actually included in both the “nas” and “late_aircraft” categories. Using the columns you modified in the bullet above, your job is to create a new column that calculates the total number of flights (in 1,000’s) delayed by weather (both severe and mild). Show your work by printing the first 5 rows of the dataset. Use these three rules for your calculations:
No discussion needed for this question
# Include and execute your code here
df_weather = (
df_clean
.with_columns(
(
pl.col("num_of_delays_weather")
+ (pl.col("num_of_delays_late_aircraft") * 0.30)
+ (
pl.when(pl.col("month").is_in(["April", "May", "June", "July", "August"]))
.then(pl.col("num_of_delays_nas") * 0.40)
.otherwise(pl.col("num_of_delays_nas") * 0.65)
)
).alias("weather_delay_total")
)
)
df_weather.select([
"airport_code",
"airport_name",
"year",
"month",
"season",
"num_of_delays_weather",
"num_of_delays_late_aircraft",
"num_of_delays_nas",
"weather_delay_total"
]).head(5)| airport_code | airport_name | year | month | season | num_of_delays_weather | num_of_delays_late_aircraft | num_of_delays_nas | weather_delay_total |
|---|---|---|---|---|---|---|---|---|
| str | str | i64 | str | str | f64 | f64 | f64 | f64 |
| "DEN" | "Denver, CO: Denver Internation… | 2005 | "January" | "Winter" | 0.233 | 0.928 | 0.935 | 1.11915 |
| "IAD" | "" | 2005 | "January" | "Winter" | 0.061 | 1.058 | 0.895 | 0.96015 |
| "ORD" | "Chicago, IL: Chicago O'Hare In… | 2005 | "January" | "Winter" | 0.306 | 2.255 | 5.415 | 4.50225 |
| "SAN" | "San Diego, CA: San Diego Inter… | 2005 | "January" | "Winter" | 0.056 | 0.68 | 0.638 | 0.6747 |
| "SFO" | "San Francisco, CA: San Francis… | 2005 | "January" | "Winter" | 0.114 | 0.733 | 1.166 | 1.0918 |
It seems like winter and fall have the most delays, most likely because you have to de ice the wings and whatnot. Spring and summer have fewer delays, but the low end of the boxes are all relatively in the same spot.
# Include and execute your code here
(
ggplot(df_weather, aes(x="season", y="weather_delay_total"))
+ geom_boxplot()
+ scale_x_discrete(limits=["Winter", "Spring", "Summer", "Fall"])
+ labs(
title="Weather-Related Flight Delays by Season",
x="Season",
y="Weather-Related Delays (in thousands)"
)
)