import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the personal income tables (G17B, G17C) and the need-for-assistance table (G18)
# for the 2016 and 2021 Census.
# Adjust the two directory strings below if the notebook and the CSV files live elsewhere.
_CENSUS_2016_DIR = "../data/2016_GCP_AU_for_AUS_short-header/2016 Census GCP Australia for AUST"
_CENSUS_2021_DIR = "../data/2021_GCP_AUS_for_AUS_short-header/2021 Census GCP Australia for AUS"

g17b_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G17B_AUS.csv")
g17c_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G17C_AUS.csv")
g18_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G18_AUS.csv")

g17b_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G17B_AUS_AUS.csv")
g17c_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G17C_AUS_AUS.csv")
g18_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G18_AUS_AUS.csv")

ERROR! Session/line number was not unique in database. History logging moved to new session 5

# 1. Report (rows, columns) for every loaded table.
# Each entry is (label, dataframe, extra print arguments); the extra "\n" leaves a blank
# line after the income tables and again after the assistance tables.
shape_report = (
    ("G17B_2016", g17b_2016, ()),
    ("G17C_2016", g17c_2016, ()),
    ("G17B_2021", g17b_2021, ()),
    ("G17C_2021", g17c_2021, ("\n",)),
    ("G18_2016", g18_2016, ()),
    ("G18_2021", g18_2021, ("\n",)),
)
for table_label, table, trailing in shape_report:
    print(f"{table_label} shape:", table.shape, *trailing)

G17B_2016 shape: (1, 201)
G17C_2016 shape: (1, 81)
G17B_2021 shape: (1, 201)
G17C_2021 shape: (1, 111) 

G18_2016 shape: (1, 145)
G18_2021 shape: (1, 145)

# 2. Example: column data types in the G17C_2021 and G18_2016 dataframes.
# The value columns are int64; note that the AUS_CODE_2021 identifier column is read as object.
# For conciseness, the data types of only these 2 dataframes are shown.
for table_label, table in (("G17C_2021", g17c_2021), ("G18_2016", g18_2016)):
    print(f"Data types in {table_label}:\n", table.dtypes)

Data types in G17C_2021:
 AUS_CODE_2021          object
P_650_799_15_19_yrs     int64
P_650_799_20_24_yrs     int64
P_650_799_25_34_yrs     int64
P_650_799_35_44_yrs     int64
                        ...  
P_Tot_55_64_yrs         int64
P_Tot_65_74_yrs         int64
P_Tot_75_84_yrs         int64
P_Tot_85ov              int64
P_Tot_Tot               int64
Length: 111, dtype: object
Data types in G18_2016:
 AUS_CODE_2016                    int64
M_0_4_yrs_Need_for_assistance    int64
M_0_4_No_need_for_assistance     int64
M_0_4_Need_for_assistance_ns     int64
M_0_4_yrs_Tot                    int64
                                 ...  
P_85_yrs_over_Tot                int64
P_Tot_Need_for_assistance        int64
P_Tot_No_need_for_assistance     int64
P_Tot_Need_for_assistance_ns     int64
P_Tot_Tot                        int64
Length: 145, dtype: object

# 3. Example: column name descriptors used in the G18_2021 dataframe.
#    Only every 7th name is printed to keep the output concise; the slicing step was varied
#    during investigation in order to see every misspelt column name.
# 4. Example: the persons data is split between G17B and G17C, so the full G17B_2016 index
#    and the first 10 names of G17C_2016 are printed.
column_previews = (
    ("G18_2021", g18_2021.columns[::7]),
    ("G17b_2016", g17b_2016.columns),
    ("G17c_2016", g17c_2016.columns[:10]),
)
for table_label, names in column_previews:
    print(f"Column names in {table_label}:\n", names)

Column names in G18_2021:
 Index(['AUS_CODE_2021', 'M_5_14_Need_for_assistance_ns',
       'M_20_24_No_need_for_assistnce', 'M_35_44_Need_for_assistance',
       'M_45_54_yrs_Tot', 'M_65_74_Need_for_assistnce_ns',
       'M_85_ov_No_need_for_assistnce', 'F_0_4_yrs_Need_for_assistance',
       'F_5_14_yrs_Tot', 'F_20_24_Need_for_assistnce_ns',
       'F_35_44_No_need_for_assistnce', 'F_55_64_Need_for_assistance',
       'F_65_74_yrs_Tot', 'F_85_ov_Need_for_assistnce_ns',
       'P_0_4_No_need_for_assistance', 'P_15_19_Need_for_assistance',
       'P_20_24_yrs_Tot', 'P_35_44_Need_for_assistnce_ns',
       'P_55_64_No_need_for_assistnce', 'P_75_84_Need_for_assistance',
       'P_85_yrs_over_Tot'],
      dtype='object')
Column names in G17b_2016:
 Index(['AUS_CODE_2016', 'F_400_499_15_19_yrs', 'F_400_499_20_24_yrs',
       'F_400_499_25_34_yrs', 'F_400_499_35_44_yrs', 'F_400_499_45_54_yrs',
       'F_400_499_55_64_yrs', 'F_400_499_65_74_yrs', 'F_400_499_75_84_yrs',
       'F_400_499_85ov',
       ...
       'P_800_999_15_19_yrs', 'P_800_999_20_24_yrs', 'P_800_999_25_34_yrs',
       'P_800_999_35_44_yrs', 'P_800_999_45_54_yrs', 'P_800_999_55_64_yrs',
       'P_800_999_65_74_yrs', 'P_800_999_75_84_yrs', 'P_800_999_85ov',
       'P_800_999_Tot'],
      dtype='object', length=201)
Column names in G17c_2016:
 Index(['AUS_CODE_2016', 'P_1000_1249_15_19_yrs', 'P_1000_1249_20_24_yrs',
       'P_1000_1249_25_34_yrs', 'P_1000_1249_35_44_yrs',
       'P_1000_1249_45_54_yrs', 'P_1000_1249_55_64_yrs',
       'P_1000_1249_65_74_yrs', 'P_1000_1249_75_84_yrs', 'P_1000_1249_85ov'],
      dtype='object')

# Directories holding the 2016 and 2021 Census CSV files (relative to the notebook).
_CENSUS_2016_DIR = "../data/2016_GCP_AU_for_AUS_short-header/2016 Census GCP Australia for AUST"
_CENSUS_2021_DIR = "../data/2021_GCP_AUS_for_AUS_short-header/2021 Census GCP Australia for AUS"

# Personal income datasets (G17B and G17C) for both years
g17b_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G17B_AUS.csv")
g17c_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G17C_AUS.csv")
g17b_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G17B_AUS_AUS.csv")
g17c_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G17C_AUS_AUS.csv")

# Core activity need for assistance datasets (G18) for both years
g18_2016 = pd.read_csv(f"{_CENSUS_2016_DIR}/2016Census_G18_AUS.csv")
g18_2021 = pd.read_csv(f"{_CENSUS_2021_DIR}/2021Census_G18_AUS_AUS.csv")

# 1. Keep only Persons columns (names starting with "P_").
#    An anchored regex is used because `filter(like="P_")` matches "P_" anywhere in the
#    column name, not only as a prefix.
g18_2016_persons = g18_2016.filter(regex=r"^P_")
g18_2021_persons = g18_2021.filter(regex=r"^P_")

# 2. Identify "not stated" columns (contain 'ns') and drop them.
#    Names are lower-cased first so that both upper- and lower-case spellings are found.
not_stated_cols_2016 = [col for col in g18_2016_persons.columns if "ns" in col.lower()]
not_stated_cols_2021 = [col for col in g18_2021_persons.columns if "ns" in col.lower()]

g18_2016_persons = g18_2016_persons.drop(columns=not_stated_cols_2016)
g18_2021_persons = g18_2021_persons.drop(columns=not_stated_cols_2021)

# 3. Melt wide to long format (one row per original column) and tag each table with its year
g18_2016_long = g18_2016_persons.melt(var_name="Category_Age", value_name="Count")
g18_2016_long["Year"] = 2016

g18_2021_long = g18_2021_persons.melt(var_name="Category_Age", value_name="Count")
g18_2021_long["Year"] = 2021

# 4. Combine both years into a single DataFrame
g18_df = pd.concat([g18_2016_long, g18_2021_long], ignore_index=True)

# 5. Drop Totals and under-15s.
#    `.copy()` turns the filtered result into an independent frame, so the column
#    assignments below do not raise SettingWithCopyWarning.
is_total = g18_df["Category_Age"].str.contains("Tot", case=False)
is_under_15 = g18_df["Category_Age"].str.contains("0_4|5_14", case=False)
g18_df = g18_df[~is_total & ~is_under_15].copy()

# 6. Extract AgeGroup by splitting the column name on underscores.
#    Tokens 1 and 2 (the ones right after "P") form the age range,
#    e.g. "P_15_19_Need_for_assistance" -> "15_19_yrs".
#    The .str.split() approach was inspired by Wes McKinney's "Python for Data Analysis",
#    Ch. 7.4 String Manipulation [13].
parts = g18_df["Category_Age"].str.split("_", expand=True)
g18_df["AgeGroup"] = parts[1] + "_" + parts[2] + "_yrs"

# 7. Normalise the 85+ group: step 6 yields "85_over_yrs" or "85_ov_yrs" depending on how
#    the ABS header spells it, so both are mapped to a single label.
g18_df["AgeGroup"] = g18_df["AgeGroup"].replace({
    "85_over_yrs": "85_plus",
    "85_ov_yrs": "85_plus"
})

# 8. Create a simpler Category column (need vs no need for assistance).
#    The test is case-sensitive: "..._Need_for_..." contains "Need", while
#    "..._No_need_for_..." only contains the lower-case "need".
has_need = g18_df["Category_Age"].str.contains("Need", regex=False)
g18_df["Category"] = has_need.map({True: "Need_for_assistance", False: "No_need_for_assistance"})

# 9. Drop rows where AgeGroup is NaN.
#    Any row whose name did not yield an age range is not meaningful for age-based analysis,
#    so it is dropped. This ensures only valid age intervals remain in the dataset.
g18_df = g18_df.dropna(subset=["AgeGroup"])

# With the .head() method, we can see the fruits of our cleaning so far.
g18_df.head()

# 1. Keep only Persons (P_) columns from the G17B and G17C tables of both Census years.
#    An anchored regex is used because `filter(like="P_")` matches "P_" anywhere in the
#    column name, not only as a prefix.
g17b_2016_persons = g17b_2016.filter(regex=r"^P_")
g17c_2016_persons = g17c_2016.filter(regex=r"^P_")
g17b_2021_persons = g17b_2021.filter(regex=r"^P_")
g17c_2021_persons = g17c_2021.filter(regex=r"^P_")

# 2. Merge the two tables into a single dataframe (along the horizontal axis)
g17_2016_persons = pd.concat([g17b_2016_persons, g17c_2016_persons], axis=1)
g17_2021_persons = pd.concat([g17b_2021_persons, g17c_2021_persons], axis=1)

# 3. Drop Totals and Not stated columns
drop_cols_2016 = [col for col in g17_2016_persons.columns if "Tot" in col or "ns" in col.lower()]
drop_cols_2021 = [col for col in g17_2021_persons.columns if "Tot" in col or "ns" in col.lower()]

g17_2016_persons = g17_2016_persons.drop(columns=drop_cols_2016)
g17_2021_persons = g17_2021_persons.drop(columns=drop_cols_2021)

# 4. Melt wide to long format (one row per original column) and tag each table with its year
g17_2016_long = g17_2016_persons.melt(var_name="Income_Age", value_name="Count")
g17_2016_long["Year"] = 2016

g17_2021_long = g17_2021_persons.melt(var_name="Income_Age", value_name="Count")
g17_2021_long["Year"] = 2021

# 5. Combine into one dataframe
g17_df = pd.concat([g17_2016_long, g17_2021_long], ignore_index=True)

# 6. Extract AgeGroup (e.g. 25_34_yrs) from the end of the column name.
#    The pattern is anchored with `$` so that the upper bound of an income bracket cannot be
#    mistaken for an age (unanchored, "P_1_149_85_yrs_ovr" would yield "149_85_yrs").
g17_df["AgeGroup"] = g17_df["Income_Age"].str.extract(r"(\d+_\d+_yrs|85.*)$", expand=False)

# 7. Fix ABS typos for the 85+ age group (e.g. "85_yrs_ovr", "85ov")
g17_df["AgeGroup"] = g17_df["AgeGroup"].replace({
    "85_yrs_ovr": "85_plus",
    "85ov": "85_plus"
})

# 8. Extract IncomeBracket: the first two underscore-separated tokens after "P_"
#    (e.g. 1000_1249, 1500_1749, Neg_Nil)
g17_df["IncomeBracket"] = g17_df["Income_Age"].str.extract(
    r"P_([^_]+_[^_]+|Neg.*|Nil.*)", expand=False
)

# 9. Combine 2021's "3000_3499" and "3500_more" into one "3000_more".
#    The rows are only relabelled here, not summed, so two rows share the "3000_more" label
#    until a later aggregation adds them together.
g17_df["IncomeBracket"] = g17_df["IncomeBracket"].replace({
    "3000_3499": "3000_more",
    "3500_more": "3000_more"
})

# 10. Fix ABS typos in IncomeBracket
# This manual fix is necessary because the ABS column labels contain small typos
# (e.g. "Negtve_Nil" instead of "Neg_Nil"). Without correcting these, the same category would
# appear twice under different names, which would distort proportions and make comparisons
# and plots misleading.
g17_df["IncomeBracket"] = g17_df["IncomeBracket"].replace({
    "Negtve_Nil": "Neg_Nil"
})

# With the .head() method, we can see the fruits of our cleaning so far.
g17_df.head()

def add_proportion(df, group_cols, count_col="Count", prop_col="Proportion"):
    """
    Return a copy of `df` with each row's share of its group total added.

    Rows are grouped by `group_cols`; `prop_col` holds `count_col` divided by the
    sum of `count_col` over the row's group, so the shares within a group add up
    to 1. The input dataframe is left unmodified.
    """
    result = df.copy()  # work on a copy so the caller's dataframe is never edited
    # .transform("sum") broadcasts every group's total back onto its member rows
    group_totals = result.groupby(group_cols)[count_col].transform("sum")
    result[prop_col] = result[count_col] / group_totals
    return result

# Compute proportions within every (AgeGroup, Year) combination for both datasets
proportion_keys = ["AgeGroup", "Year"]
g18_final = add_proportion(g18_df, group_cols=proportion_keys)
g17_final = add_proportion(g17_df, group_cols=proportion_keys)

# Verify that the cleaning worked as expected by listing the distinct labels that remain
label_checks = (
    ("Unique AgeGroups in G18:", g18_df["AgeGroup"]),
    ("Unique Categories in G18:", g18_df["Category"]),
    ("Unique AgeGroups in G17:", g17_df["AgeGroup"]),
    ("Unique Categories in G17:", g17_df["IncomeBracket"]),
)
for heading, column in label_checks:
    print(heading, column.unique())

Unique AgeGroups in G18: ['15_19_yrs' '20_24_yrs' '25_34_yrs' '35_44_yrs' '45_54_yrs' '55_64_yrs'
 '65_74_yrs' '75_84_yrs' '85_plus']
Unique Categories in G18: ['Need_for_assistance' 'No_need_for_assistance']
Unique AgeGroups in G17: ['15_19_yrs' '20_24_yrs' '25_34_yrs' '35_44_yrs' '45_54_yrs' '55_64_yrs'
 '65_74_yrs' '75_84_yrs' '85_plus']
Unique Categories in G17: ['Neg_Nil' '1_149' '150_299' '300_399' '400_499' '500_649' '650_799'
 '800_999' '1000_1249' '1250_1499' '1500_1749' '1750_1999' '2000_2999'
 '3000_more']

# Grouped bar chart for Assistance Needs (Facet Grid): one facet per Census year,
# one bar per Category within each age group.
# This Facet Grid graph was inspired by [13].
g = sns.catplot(
    data=g18_final,  # cleaned G18 dataframe with the Proportion column
    x="AgeGroup",
    y="Proportion",
    hue="Category",
    col="Year",
    kind="bar",
    height=5,
    aspect=1.2,
    palette="deep",
    errorbar=None  # removes error bars
)

# Title for the whole graph (`figure` is the public accessor; `fig` is deprecated)
g.figure.suptitle("Proportion of People Needing Assistance by Age Group (2016 vs 2021)",
                  fontsize=14, fontweight="bold")

# Axis labels with units
g.set_axis_labels("Age Group (years)", "Proportion of People (between 0.0 - 1.0)")
g.set_titles("Census {col_name}")
g.set(ylim=(0, 1))  # upper and lower limit of y-axis
g.set(yticks=[0, 0.2, 0.4, 0.6, 0.8, 1.0])  # increments of y-axis

# Retitle and reposition the legend through the public `legend` property
# rather than the private `_legend` attribute.
legend = g.legend
legend.set_title("Category")
legend.set_bbox_to_anchor((0.9, 0.9))
legend.set_loc("center")

# Rotate x-axis labels on both facets. A for loop is used since there are 2 graphs
for ax in g.axes.flat:
    ax.tick_params(axis='x', labelrotation=45)

plt.tight_layout()

# Saving Visualisations
# The generated figures are saved to the `figures/` directory for reuse in reports and inclusion in the project README.
plt.savefig("../figures/assistance_vs_age.png", dpi=300, bbox_inches="tight")

plt.show()

# Group the income brackets into sets of 3 so that the chart is easier to read
def income_grouper(bracket):
    """
    Map a cleaned income bracket label (e.g. "650_799") to its broader income tier.

    Returns the tier label, or None when the bracket belongs to no tier.
    """
    tiers = (
        ("Very low (≤299)", ("Neg_Nil", "1_149", "150_299")),
        ("Low (300–649)", ("300_399", "400_499", "500_649")),
        ("Lower-mid (650–1249)", ("650_799", "800_999", "1000_1249")),
        ("Upper-mid (1250–1999)", ("1250_1499", "1500_1749", "1750_1999")),
        ("High (≥2000)", ("2000_2999", "3000_more")),
    )
    # Tiers are checked from lowest to highest; the first one listing the bracket wins.
    for tier_label, members in tiers:
        if bracket in members:
            return tier_label
    return None

# Label every row with its broader income tier (new IncomeGroup column)
g17_final["IncomeGroup"] = g17_final["IncomeBracket"].map(income_grouper)

# Summarise: add up the proportions of the brackets that fall into the same
# (AgeGroup, Year, IncomeGroup) combination [14]
income_group_keys = ["AgeGroup", "Year", "IncomeGroup"]
g17_grouped = g17_final.groupby(income_group_keys, as_index=False, observed=False)["Proportion"].sum()

# Make side-by-side stacked bar charts for 2016 and 2021 (a facet grid).
# The axes array contains 2 plots (0 and 1) that share one y-axis.
# Stacked graph inspired by Wes McKinney's "Python for Data Analysis", Ch. 9.2 Plotting and Visualization [13].
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# Income tiers from lowest to highest, looked up from one representative bracket per tier.
# Without an explicit order, pivot() sorts the columns alphabetically
# ("High", "Low", "Lower-mid", ...), which scrambles the stacking and legend order.
income_order = [
    income_grouper(bracket)
    for bracket in ("Neg_Nil", "300_399", "650_799", "1250_1499", "2000_2999")
]


def _income_shares_wide(year):
    """Return an AgeGroup x IncomeGroup table of proportions for one Census year."""
    one_year = g17_grouped[g17_grouped["Year"] == year]
    wide = one_year.pivot(index="AgeGroup", columns="IncomeGroup", values="Proportion")
    # Put the columns in income order; a tier absent from the data becomes a zero column.
    return wide.reindex(columns=income_order, fill_value=0)


df_2016 = _income_shares_wide(2016)
df_2021 = _income_shares_wide(2021)

for ax, shares, panel_title in zip(axes, (df_2016, df_2021), ("Census 2016", "Census 2021")):
    shares.plot.bar(
        stacked=True,
        ax=ax,
        colormap="tab20c",
        legend=False   # a single legend is added to the right of the 2021 graph below
    )
    ax.set_title(panel_title)
    ax.set_ylabel("Proportion of People (0.0–1.0)")
    ax.set_xlabel("Age Group (years)")

# One legend only, placed outside the right-hand (2021) graph
axes[1].legend(title="Income Group", bbox_to_anchor=(1.05, 1), loc="upper left")

# Title
plt.suptitle("Income Distribution by Age Group (Grouped in 3 Interval Brackets)", y=1.02, fontsize=14)
plt.tight_layout()

# Saving Visualisations
# The generated figures are saved to the `figures/` directory for reuse in reports and inclusion in the project README.
plt.savefig("../figures/income_vs_age.png", dpi=300, bbox_inches="tight")

plt.show()

Comparing Income and Assistance Needs Across Australian Census Data (2016 vs 2021)¶

Overview¶

Research Question¶

Project Structure (GitHub)¶

1. Introduction¶

2. Data sourcing¶

2.1 Census datasets overview¶

2.2 Accessing the data¶

2.3 Dataset contents and relevance¶

2.3.1 Opening the dataset¶

2.3.2 Inspecting summary information¶

2.3.3 Justifying dataset selection¶

3. Assumptions, implications and problem framing¶

3.1 Assumptions and implications¶

3.3 Problem reframed as individual goals¶

4. Data preparation and cleaning¶

4.1 Load relevant datasets¶

4.2 Cleaning G18 (Core activity need for assistance) data¶

4.3 Cleaning G17 (Total Personal income) data¶

4.4 Calculating proportions¶

5. Data visualisation¶

5.1 Visualisation: Assistance Needs by Age Group (2016 vs 2021)¶

5.2 Visualisation: Personal Income Distribution by Age Group (2016 vs 2021)¶

5.3 Relationship between Income and Assistance Needs¶

6. Discussion and interpretation¶

6.1 Assistance Needs by Age Group¶

6.2 Income by Age Group¶

6.3 Intersection of Income and Assistance Needs¶

6.4 Link back to our key goals¶

7. Conclusion¶

8. Sources¶

	Category_Age	Count	Year	AgeGroup	Category
6	P_15_19_Need_for_assistance	37003	2016	15_19_yrs	Need_for_assistance
7	P_15_19_No_need_for_assistnce	1301812	2016	15_19_yrs	No_need_for_assistance
9	P_20_24_Need_for_assistance	28908	2016	20_24_yrs	Need_for_assistance
10	P_20_24_No_need_for_assistnce	1423257	2016	20_24_yrs	No_need_for_assistance
12	P_25_34_Need_for_assistance	51908	2016	25_34_yrs	Need_for_assistance

	Income_Age	Count	Year	AgeGroup	IncomeBracket
0	P_Neg_Nil_income_15_19_yrs	625657	2016	15_19_yrs	Neg_Nil
1	P_Neg_Nil_income_20_24_yrs	200271	2016	20_24_yrs	Neg_Nil
2	P_Neg_Nil_income_25_34_yrs	239572	2016	25_34_yrs	Neg_Nil
3	P_Neg_Nil_income_35_44_yrs	187924	2016	35_44_yrs	Neg_Nil
4	P_Neg_Nil_income_45_54_yrs	171997	2016	45_54_yrs	Neg_Nil