# import required packages
import numpy as np
import datascience as ds

# These lines do some fancy plotting magic
import matplotlib
# Required to view plots in a notebook
%matplotlib inline
import matplotlib.pyplot as plt
# This is just to make the plots look a certain way
plt.style.use('fivethirtyeight')

# import datascience techniques
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Load the raw survey responses and preview the first few rows.
cah = ds.Table.read_table("CAH_PulseoftheNation_FinalProject.csv")
cah.show(5)


# Clean the data so every retained respondent gave a definitive answer:
# rows with a "DK/REF" response (and, for the demographic columns, "Other")
# are dropped.
# print() is used here because a notebook cell only echoes its final
# expression; as bare expressions these two values were never displayed.
print("Rows before cleaning:", cah.num_rows)
print("Columns:", cah.labels)

# Column label -> responses that disqualify a row.
excluded_responses = {
    "Ranked Choice": ("DK/REF",),
    "Gender": ("DK/REF", "Other"),
    "Race": ("DK/REF", "Other"),
    "Education": ("DK/REF", "Other"),
    "Political Affiliation": ("DK/REF",),
    "Political Leaning": ("DK/REF",),
    "Trump": ("DK/REF",),
    "Finances": ("DK/REF",),
    "Fair Elections": ("DK/REF",),
    "Woman President": ("DK/REF",),
    "Universal Healthcare": ("DK/REF",),
}

# Apply one filter per (column, response) pair; each filter only removes
# rows, so the order of application does not affect the final table.
cah_clean = cah
for column_label, responses in excluded_responses.items():
    for response in responses:
        cah_clean = cah_clean.where(column_label, ds.are.not_equal_to(response))

cah_clean.num_rows

486


# Encode the ranked-choice answer as two 0/1 indicator columns, which are
# easier to summarize than the raw "Yes"/"No" text.
ranked_choice_answers = cah_clean.column("Ranked Choice")
support = np.where(ranked_choice_answers == "Yes", 1, 0)
no_support = np.where(ranked_choice_answers == "No", 1, 0)

cah_clean = (
    cah_clean
    .with_column("Support", support)
    .with_column("No Support", no_support)
)

cah_clean.show(5)


# Headline numbers for the data description: sample size before and after
# cleaning, plus the count and share of respondents on each side of the
# ranked-choice question.

original_rows = cah.num_rows
cleaned_rows = cah_clean.num_rows

ranked_choice_yes = cah_clean.where("Ranked Choice", "Yes").num_rows
ranked_choice_no = cah_clean.where("Ranked Choice", "No").num_rows

# "Support" and "No Support" hold 0/1 values, so their means are proportions.
support_rate = np.mean(cah_clean.column("Support"))
no_support_rate = np.mean(cah_clean.column("No Support"))

# One (description, value) pair per row of the summary table.
summary_rows = [
    ("Original dataset", original_rows),
    ("Cleaned dataset", cleaned_rows),
    ("Supports ranked choice voting", ranked_choice_yes),
    ("Does not support ranked choice voting", ranked_choice_no),
    ("Support rate", support_rate),
    ("No support rate", no_support_rate),
]
summary_descriptions, summary_values = zip(*summary_rows)

summary_table = ds.Table().with_columns(
    "Description", ds.make_array(*summary_descriptions),
    "Value", ds.make_array(*summary_values),
)

summary_table


# Respondent counts for every category of the main grouping variables,
# stacked into a single long table.
political_leaning_counts = cah_clean.group("Political Leaning")
political_affiliation_counts = cah_clean.group("Political Affiliation")
education_counts = cah_clean.group("Education")
fair_elections_counts = cah_clean.group("Fair Elections")

# Variable label -> its grouped table. Insertion order sets the row order
# of the combined table.
counts_by_variable = {
    "Political Leaning": political_leaning_counts,
    "Political Affiliation": political_affiliation_counts,
    "Education": education_counts,
    "Fair Elections": fair_elections_counts,
}

# Concatenate whole columns instead of growing each array one np.append at
# a time in four copy-pasted loops. This also keeps the counts as integers:
# appending to the empty array returned by ds.make_array() (which NumPy
# creates as float) silently converted them to floats.
group_variable = np.concatenate(
    [np.repeat(label, counts.num_rows) for label, counts in counts_by_variable.items()]
)
group_category = np.concatenate(
    [counts.column(label) for label, counts in counts_by_variable.items()]
)
group_count = np.concatenate(
    [counts.column("count") for counts in counts_by_variable.values()]
)

# For the combined table
combined_group_counts = ds.Table().with_columns(
    "Variable", group_variable,
    "Group", group_category,
    "Count", group_count
)

combined_group_counts


# Summary statistics for age.
age = cah_clean.column("Age")
# Printed explicitly: a notebook cell only echoes its last expression, so
# as a bare tuple in the middle of the cell these values were never shown.
print(
    "Age (mean, median, min, max):",
    (np.mean(age), np.median(age), np.min(age), np.max(age)),
)
# Average age by ranked choice voting support
cah_clean.select("Ranked Choice", "Age").group("Ranked Choice", np.mean)


# Bar chart of how many respondents gave each ranked-choice answer.
overall = cah_clean.group("Ranked Choice")
overall_answers = overall.column("Ranked Choice")
overall_counts = overall.column("count")

plt.bar(overall_answers, overall_counts)
plt.xlabel("Response")
plt.ylabel("Number of Respondents")
plt.title("Overall Support for Ranked Choice Voting")
plt.show()


def _plot_support_by(table, column, title, xlabel, tick_rotation=None):
    """Show and plot the proportion supporting ranked choice per category.

    Groups `table` by `column`, averages its "Support" column within each
    group, displays the resulting table, and draws a bar chart of the
    group means.

    Parameters:
        table: table containing `column` and a "Support" column.
        column: label of the categorical column to group by.
        title: chart title.
        xlabel: x-axis label.
        tick_rotation: rotation in degrees for the x tick labels, or None
            to leave the ticks as matplotlib draws them.

    Returns:
        The grouped table, with columns `column` and "Support mean".
    """
    support_means = table.select(column, "Support").group(column, np.mean)
    # Display explicitly: a bare table expression that is not the last
    # line of a notebook cell is never rendered.
    support_means.show()

    plt.bar(support_means.column(column), support_means.column("Support mean"))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Proportion Supporting")
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()
    return support_means


support_by_leaning = _plot_support_by(
    cah_clean,
    "Political Leaning",
    title="Support for Ranked Choice Voting by Political Leaning",
    xlabel="Political Leaning",
)


support_by_party = _plot_support_by(
    cah_clean,
    "Political Affiliation",
    title="Support for Ranked Choice Voting by Political Affiliation",
    xlabel="Political Affiliation",
)


# The longer category labels are rotated so they do not overlap.
support_by_education = _plot_support_by(
    cah_clean,
    "Education",
    title="Support for Ranked Choice Voting by Education",
    xlabel="Education",
    tick_rotation=30,
)


support_by_fair = _plot_support_by(
    cah_clean,
    "Fair Elections",
    title="Support by Confidence in Fair Elections",
    xlabel="Confidence in Fair Elections",
    tick_rotation=30,
)


# Observed gap in support between liberal and conservative respondents.
liberal_group = cah_clean.where("Political Leaning", ds.are.equal_to("Liberal"))
conservative_group = cah_clean.where("Political Leaning", ds.are.equal_to("Conservative"))

# "Support" is a 0/1 column, so each mean is the share who support.
liberal_support = liberal_group.column("Support").mean()
conservative_support = conservative_group.column("Support").mean()

observed_difference = liberal_support - conservative_support

liberal_support, conservative_support, observed_difference

(0.63963963963963966, 0.36805555555555558, 0.27158408408408408)


# Bootstrap the liberal-minus-conservative difference in support: resample
# each group with replacement at its original size and record the
# difference in the resampled support proportions.
num_resamples = 1000

# Pre-allocated so each iteration writes one slot instead of copying the
# whole array with np.append.
bootstrap_differences = np.empty(num_resamples)

for i in np.arange(num_resamples):
    liberal_sample = liberal_group.sample(liberal_group.num_rows, with_replacement = True)
    conservative_sample = conservative_group.sample(conservative_group.num_rows, with_replacement = True)

    liberal_sample_support = np.mean(liberal_sample.column("Support"))
    conservative_sample_support = np.mean(conservative_sample.column("Support"))

    difference = liberal_sample_support - conservative_sample_support
    bootstrap_differences[i] = difference

# The middle 95% of the bootstrap differences.
left_bound, right_bound = np.percentile(bootstrap_differences, [2.5, 97.5])

# Printed explicitly: as a bare tuple followed by plotting code, the
# interval was computed but never displayed.
print("95% bootstrap confidence interval:", (left_bound, right_bound))

#To make a graph
plt.hist(bootstrap_differences)
plt.title("Bootstrap Differences in Support")
plt.xlabel("Liberal Support - Conservative Support")
plt.ylabel("Frequency")
plt.show()


# Keep only the predictor columns and the outcome ("No Support") that the
# prediction step uses.
prediction_columns = [
    "Age",
    "Political Affiliation",
    "Political Leaning",
    "Education",
    "Fair Elections",
    "No Support",
]
prediction_table = cah_clean.select(*prediction_columns)

prediction_table.show(5)


# Turn the categorical predictors into 0/1 indicator columns.
def _indicator(column_label, category):
    """Return a 0/1 array marking rows of prediction_table whose value in
    `column_label` equals `category`."""
    return np.where(prediction_table.column(column_label) == category, 1, 0)


democrat = _indicator("Political Affiliation", "Democrat")
republican = _indicator("Political Affiliation", "Republican")

liberal = _indicator("Political Leaning", "Liberal")
conservative = _indicator("Political Leaning", "Conservative")

college_degree = _indicator("Education", "College degree")
graduate_degree = _indicator("Education", "Graduate degree")

# Any answer other than "No" is coded as 1.
fair_elections_yes = np.where(prediction_table.column("Fair Elections") != "No", 1, 0)

# (label, values) pairs in the column order of the modelling table; the
# outcome column goes last.
ml_columns = [
    ("Age", prediction_table.column("Age")),
    ("Democrat", democrat),
    ("Republican", republican),
    ("Liberal", liberal),
    ("Conservative", conservative),
    ("College Degree", college_degree),
    ("Graduate Degree", graduate_degree),
    ("Fair Elections Yes", fair_elections_yes),
    ("No Support", prediction_table.column("No Support")),
]

ml_table = ds.Table()
for ml_label, ml_values in ml_columns:
    ml_table = ml_table.with_column(ml_label, ml_values)

ml_table.show(5)


# Split the data into training and test sets.
train_fraction = 0.8
rows_to_take = int(ml_table.num_rows * train_fraction)

# Shuffle every row, then take the first `rows_to_take` rows for training
# and the remainder for testing.
# NOTE(review): the shuffle is unseeded, so the split and every metric
# computed from it change on each run; seed the random number generator
# before this line if the reported numbers need to be reproducible.
shuffled = ml_table.sample(with_replacement = False)

train = shuffled.take(np.arange(rows_to_take))
test = shuffled.take(np.arange(rows_to_take, ml_table.num_rows))

# Printed explicitly: as a bare tuple in the middle of the cell the sizes
# were never displayed.
print("Train rows:", train.num_rows, "Test rows:", test.num_rows)

# Predictors are every column except the outcome, taken row by row.
predictors = train.drop("No Support").rows
outcome = train.column("No Support")

test_predictors = test.drop("No Support").rows
expected = test.column("No Support")


# K-nearest-neighbours model: fit on the training rows, then score the
# predictions for the test rows.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(predictors, outcome)

knn_predicted = knn.predict(test_predictors)

# Same three metrics, in the order accuracy, precision, recall.
knn_accuracy, knn_precision, knn_recall = (
    metric(expected, knn_predicted)
    for metric in (accuracy_score, precision_score, recall_score)
)

knn_accuracy, knn_precision, knn_recall

(0.59183673469387754, 0.58823529411764708, 0.61224489795918369)


# Decision tree model (depth capped at 4): fit on the training rows, then
# score the predictions for the test rows.
tree = DecisionTreeClassifier(max_depth=4)
tree.fit(predictors, outcome)

tree_predicted = tree.predict(test_predictors)

# Same three metrics, in the order accuracy, precision, recall.
tree_accuracy, tree_precision, tree_recall = (
    metric(expected, tree_predicted)
    for metric in (accuracy_score, precision_score, recall_score)
)

tree_accuracy, tree_precision, tree_recall

(0.58163265306122447, 0.59090909090909094, 0.53061224489795922)


# Now we can compare the two models side by side.
model_names = ds.make_array("KNN", "Decision Tree")

# Metric label -> (KNN value, Decision Tree value), in the same order as
# model_names.
metric_columns = {
    "Accuracy": ds.make_array(knn_accuracy, tree_accuracy),
    "Precision": ds.make_array(knn_precision, tree_precision),
    "Recall": ds.make_array(knn_recall, tree_recall),
}

model_results = ds.Table().with_column("Model", model_names)
for metric_label, metric_values in metric_columns.items():
    model_results = model_results.with_column(metric_label, metric_values)

model_results


# Bar chart comparing the two models on recall.
model_labels = model_results.column("Model")
model_recalls = model_results.column("Recall")

plt.bar(model_labels, model_recalls)
plt.xlabel("Model")
plt.ylabel("Recall for No Support")
plt.title("Model Comparison by Recall")
plt.show()


# Rank the models from highest to lowest recall.
model_results.sort("Recall", descending=True)


# Confusion matrix for the KNN predictions on the test rows.
confusion_matrix(y_true=expected, y_pred=knn_predicted)

array([[28, 21],
       [19, 30]])

Endorsement of Ranked Choice Voting in the United States

Introduction

Data Description

Data Summary

Inference: Hypothesis Test or Confidence Interval

Prediction

Conclusion

Gender	Age	Race	Education	Political Affiliation	Political Leaning	Trump	Finances	Fair Elections	Ranked Choice	Woman President	Universal Healthcare
Female	81	White	Some college	Democrat	Liberal	Strongly Disapprove	Not Very Often	No	No	Yes	Yes
Male	80	Asian	Some college	Democrat	Moderate	Somewhat Disapprove	Somewhat Often	Yes, somewhat confident	Yes	Yes	No
Female	65	Black	High school or less	Democrat	Moderate	Strongly Disapprove	Somewhat Often	Yes, somewhat confident	Yes	Yes	No
Male	24	Asian	College degree	Independent	Moderate	Strongly Disapprove	Not Very Often	Yes, somewhat confident	Yes	Yes	No
Male	74	White	Graduate degree	Democrat	Liberal	Strongly Disapprove	Not Very Often	Yes, very confident	Yes	Yes	Yes

Description	Value
Original dataset	800
Cleaned dataset	486
Supports ranked choice voting	250
Does not support ranked choice voting	236
Support rate	0.514403
No support rate	0.485597

Variable	Group	Count
Political Leaning	Conservative	144
Political Leaning	Liberal	111
Political Leaning	Moderate	231
Political Affiliation	Democrat	192
Political Affiliation	Independent	156
Political Affiliation	Republican	138
Education	College degree	153
Education	Graduate degree	105
Education	High school or less	121
Education	Some college	107

Model	Accuracy	Precision	Recall
KNN	0.591837	0.588235	0.612245
Decision Tree	0.581633	0.590909	0.530612