Code
import matplotlib.pyplot as plt
import seaborn as sns
We create a correlation matrix to explore how the variables are statistically related. For exploratory purposes, I included variables that were used in intermediate processing steps, in addition to the 10 variables of interest for our modelling.
import matplotlib.pyplot as plt
import seaborn as sns
# All variables explored in the correlation matrix. Per the accompanying
# text, this includes intermediate processing variables in addition to the
# ten modelling variables.
variables = [
    'MedHHInc', 'TotalPop', 'TotalPop16', 'LabForTotal', 'Unemployed',
    'PctBach', 'PovertyRate', 'UnemploymentRate', 'LabForParticipationRate',
    'netexport', 'REALGDP', 'life_expectancy', 'Labor_Productivity_2023',
    'REALGDPpercapita',
]

# The ten variables selected for later analysis.
selected_variables = [
    'REALGDPpercapita', 'life_expectancy', 'MedHHInc', 'PctBach',
    'UnemploymentRate', 'LabForParticipationRate', 'Labor_Productivity_2023',
    'TotalPop', 'PovertyRate', 'netexport',
]

# Pairwise correlation between every column listed in `variables`.
corr_matrix = us_rescaled_final[variables].corr()

# Plot the correlation matrix using seaborn. The colour scale is pinned to
# [-1, 1] so the diverging 'coolwarm' palette is centred on zero correlation.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
We observe some interesting correlations. Real GDP has high positive correlation with population, size of labour force and number of employed. When these variables are converted into ratios such as labour force participation rate and unemployment rate, the correlation becomes weaker.
There are some reasonable correlation expectations. For example, percentage of bachelor’s degree graduates has a high negative correlation with poverty rate and a high positive correlation with median household income, labor force participation rate and life expectancy. Meanwhile, poverty rate has a high negative correlation with median household income, labor force participation rate and life expectancy.
We can visualise these relationships further using scatter plots. I first create a repeated chart for all the variables. From an aesthetic perspective, the visualisation can be improved further.
import altair as alt
# Set up the interval selection brush used to highlight points.
brush = alt.selection_interval()

# Repeated chart: one scatter plot per (row, column) pair of variables.
# Left as a bare expression so a notebook cell displays the chart.
(
    alt.Chart(us_rescaled_final)
    .mark_circle()
    .encode(
        x=alt.X(
            alt.repeat("column"),
            type="quantitative",
            scale=alt.Scale(zero=False),
        ),
        y=alt.Y(
            alt.repeat("row"),
            type="quantitative",
            scale=alt.Scale(zero=False),
        ),
        # Conditional color: points inside the brush are colored by NAME_x,
        # everything else is drawn in light gray.
        color=alt.condition(brush, "NAME_x:N", alt.value("lightgray")),
        tooltip=['NAME_x'] + variables,
    )
    .properties(
        width=200,
        height=200,
    )
    .add_params(brush)
    # Repeat variables across rows and columns.
    .repeat(
        row=variables,
        column=variables,
    )
)
We can improve the visualisation by creating an interactive bubble plot inspired by Gapminder: https://www.gapminder.org/tools/#$chart-type=bubbles&url=v2. This allows us to select the variables we are interested in and see their distribution.
# Columns offered in each dropdown. The same list backs the x-axis, y-axis
# and bubble-size selectors, so it is defined once.
dropdown_options = [
    'MedHHInc', 'TotalPop', 'TotalPop16', 'LabForTotal', 'Unemployed',
    'PctBach', 'PovertyRate', 'UnemploymentRate', 'LabForParticipationRate',
    'netexport', 'REALGDP', 'life_expectancy', 'Labor_Productivity_2023',
]

# Define dropdown bindings for the x axis, y axis and bubble size.
dropdown_x = alt.binding_select(
    options=dropdown_options,
    name='X-axis column ',
)
dropdown_y = alt.binding_select(
    options=dropdown_options,
    name='Y-axis column ',
)
dropdown_size = alt.binding_select(
    options=dropdown_options,
    name='Bubble Size ',
)

# Create parameters for the x axis, y axis and bubble size; each starts on
# 'MedHHInc' and is driven by its dropdown.
xcol_param = alt.param(
    value='MedHHInc',
    bind=dropdown_x,
)
ycol_param = alt.param(
    value='MedHHInc',
    bind=dropdown_y,
)
size_param = alt.param(
    value='MedHHInc',
    bind=dropdown_size,
)

# Bubble chart: the encodings read the calculated fields x / y / size, which
# transform_calculate fills from whichever column each parameter selects.
chart2 = alt.Chart(us_rescaled_final).mark_circle().encode(
    x=alt.X('x:Q', scale=alt.Scale(zero=False, domain='unaggregated')).title(''),
    y=alt.Y('y:Q', scale=alt.Scale(zero=False, domain='unaggregated')).title(''),
    size=alt.Size('size:Q', scale=alt.Scale(zero=False, domain='unaggregated')).title(''),
    color='NAME_x:N',
    tooltip=['NAME_x'] + variables,  # Concatenate NAME_x with the existing variables list
).transform_calculate(
    x=f'datum[{xcol_param.name}]',
    y=f'datum[{ycol_param.name}]',
    size=f'datum[{size_param.name}]',
).add_params(
    xcol_param,
    ycol_param,
    size_param,
).properties(width=800, height=800)

chart2