from abc import ABC, abstractmethod, abstractproperty
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import cmdstanpy
from arviz import InferenceData, from_cmdstanpy
from cmdstanpy import CmdStanModel
from pandas import DataFrame
class StanModel(ABC):
    """An abstract base class for STAN models.

    Subclasses must provide ``stan_file`` (path to the STAN program),
    ``construct_model_data`` (builds the data dict passed to STAN), and a
    ``data_class`` dataclass used by ``_validate_data`` to check the
    constructed data dict.
    """

    def __init__(self, **kwargs):
        # compile the STAN program; kwargs are stored for subclasses to use
        # (e.g. to parameterize the priors on by-context/by-verb intercepts)
        self.model = CmdStanModel(stan_file=self.stan_file)
        self.init_params = kwargs

    @property
    @abstractmethod
    def stan_file(self) -> str:
        """Path to the STAN program implementing the model."""
        raise NotImplementedError

    @abstractmethod
    def construct_model_data(self, data: DataFrame) -> dict:
        """Build the data dict passed to STAN from a raw dataframe."""
        raise NotImplementedError

    def _validate_data(self) -> None:
        # constructing the dataclass raises TypeError if fields are
        # missing or extraneous
        self.data_class(**self.model_data)

    def fit(
        self,
        data: DataFrame,
        save_dir: Optional[str] = None,
        verbose: bool = True,
        map_initialization: bool = True,
        seed: int = 40392,
        iter_warmup: int = 50_000,
        iter_sampling: int = 50_000,
        **kwargs
    ) -> "StanModel":
        """Fit the model to ``data`` and return ``self``.

        Parameters
        ----------
        data
            The raw data to fit to.
        save_dir
            If given, posterior sample CSVs are written here after fitting.
        verbose
            Whether to print progress messages.
        map_initialization
            Whether to initialize sampling at the MAP estimate.
            NOTE(review): default reconstructed from garbled source — confirm.
        seed
            Random seed for the MAP optimization.
            NOTE(review): only the MAP step is seeded; sampling is not.
        iter_warmup, iter_sampling
            Number of warmup/sampling iterations.
        **kwargs
            Passed through to ``CmdStanModel.sample``; an explicit
            ``inits`` entry overrides the MAP estimate.
        """
        if verbose:
            print("Constructing model data...")

        self.model_data = self.construct_model_data(data)
        self._validate_data()

        if map_initialization:
            if verbose:
                print("Fitting model with MAP initialization...")

            map_estimate = self._compute_map_estimate(seed)

            if "inits" in kwargs:
                # inits passed to fit() should override MAP
                map_estimate.update(kwargs["inits"])

            kwargs["inits"] = map_estimate
        elif verbose:
            print("Fitting model...")

        # sample from the posterior starting at the MAP
        self.raw_model_fit = self.model.sample(
            data=self.model_data,
            iter_warmup=iter_warmup,
            iter_sampling=iter_sampling,
            **kwargs
        )

        if save_dir is not None:
            if verbose:
                print("Saving model...")
            self.save(save_dir)

        if verbose:
            print("Running MCMC diagnostics...")
            print()
            print(self.diagnose())

        return self

    def _compute_map_estimate(self, seed: int) -> dict:
        """Compute the MAP fit and return its STAN variables as a dict."""
        self.map_model_fit = self.model.optimize(
            data=self.model_data,
            seed=seed,
        )

        return self.map_model_fit.stan_variables()

    @property
    def model_fit(self) -> InferenceData:
        """The posterior samples wrapped as an arviz ``InferenceData``."""
        return from_cmdstanpy(
            self.raw_model_fit,
            coords=self.coords,
            dims=self.dims
        )

    def save(self, save_dir: str = "."):
        """Write the posterior sample CSVs to ``save_dir``."""
        self.raw_model_fit.save_csvfiles(save_dir)

    @classmethod
    def from_csv(cls, path: str, **kwargs) -> 'StanModel':
        """Reconstruct a fit model from saved posterior sample CSVs."""
        model = cls(**kwargs)
        model.raw_model_fit = cmdstanpy.from_csv(path)

        # the original lacked this return, so from_csv yielded None
        return model

    def diagnose(self) -> str:
        """Run cmdstanpy's MCMC diagnostics and return the report."""
        return self.raw_model_fit.diagnose()
Model fitting and comparison
As in Module 1, we will implement a relatively thin wrapper around cmdstanpy
’s CmdStanModel
class. This abstract base class (ABC) is effectively the same as our IslandEffectsModel
ABC. The important differences are: (i) that it accepts kwargs
in StanModel.__init__
, which we will use to pass in information for setting the priors on the by-context or by-verb intercepts; and (ii) that we don’t implement a default StanModel.construct_model_data
method. The reason for not implementing this method is that we will use StanModel
as an ABC for both the norming models and the projection models.
Also as in Module 1, we’ll want a way of mapping (or hashing) columns of our data to indices, which we’ll use for hashing verb, context, and subject identifiers. This version of hash_series
additionally allows us to specify the hashmap via the categories
parameter, which will be necessary for ensuring we are associating the correct prior on by-context intercepts with the correct context.
from numpy import ndarray
from pandas import Series, CategoricalDtype
def hash_series(series: Series, categories: Optional[list[str]] = None, indexation: int=1) -> tuple[ndarray, ndarray]:
    """Hash a series to numeric codes.

    Parameters
    ----------
    series
        The series to hash.
    categories
        An explicit ordering of categories to use as the hash map; when
        None, the categories observed in ``series`` are used (sorted).
        May also be an array-like (e.g. a previously computed hash map).
    indexation
        The starting index: 0 or 1 (defaults to 1).

    Returns
    -------
    hash_map
        The category values, in code order.
    hashed_series
        The ``indexation``-based integer codes for each element.
    """
    # enforce 0- or 1-indexation
    if indexation not in [0, 1]:
        raise ValueError("Must choose either 0- or 1-indexation.")

    # convert the series to a category
    if categories is None:
        category_series = series.astype("category")
    else:
        cat_type = CategoricalDtype(categories=categories)
        category_series = series.astype(cat_type)

    # get the hash
    hash_map = category_series.cat.categories.values

    # map to zero- or one-indexed codes
    hashed_series = (category_series.cat.codes + indexation).values

    return hash_map, hashed_series
Model of prior beliefs
Our model for estimating prior beliefs from Degen and Tonhauser’s norming data will subclass StanModel
ABC and look similar to the models we wrote for Module 1. The main addition we make is a property NormingModel.context_posterior_estimates
which returns estimates of \(\mu^\text{context}_c\) and \(\sigma^\text{context}_c\), assuming that \(\rho_c \mid \mathbf{y}_\text{norming} \sim \mathcal{N}(\mu^\text{context}_c, \sigma^\text{context}_c)\).
from scipy.stats import norm
from pandas import merge
@dataclass
class NormingData:
    """Shape of the data dict expected by the norming STAN model."""
    N_resp: int            # number of responses
    N_context: int         # number of contexts
    N_subj: int            # number of subjects
    context: ndarray       # context corresponding to response n
    subj: ndarray          # subject who gave response n
    resp: ndarray          # likert scale acceptability judgment responses
class NormingModel(StanModel):
    """A STAN model for Degen and Tonhauser's (2021) norming data"""

    stan_file = "models/norming-model/norming-model.stan"
    data_class = NormingData

    def __init__(self):
        # the norming model takes no prior-specification kwargs
        super().__init__()

    def construct_context_info(self, data: DataFrame):
        """Store a deduplicated table of contexts (context, prompt, fact)."""
        self.context_info = data[["item", "prompt", "fact"]].drop_duplicates(ignore_index=True)
        self.context_info = self.context_info.rename(columns={"item": "context"})

    def construct_model_data(self, data: DataFrame):
        """Build the STAN data dict, hashing subjects and contexts to codes."""
        self.construct_context_info(data)

        # reuse existing hash maps when present so indices stay aligned
        # with a previous fit
        if hasattr(self, "subj_hash_map"):
            _, subj_hashed = hash_series(data.workerid, self.subj_hash_map)
        else:
            self.subj_hash_map, subj_hashed = hash_series(data.workerid)

        if hasattr(self, "context_hash_map"):
            _, context_hashed = hash_series(data.item, self.context_hash_map)
        else:
            self.context_hash_map, context_hashed = hash_series(data.item)

        # coordinates and dimensions consumed by arviz.from_cmdstanpy
        self.coords = {
            "subj": self.subj_hash_map,
            "context": self.context_hash_map
        }

        self.dims = {
            "context_intercept": ["context"],
            "context_prob": ["context"],
        }

        self.model_data = {
            "N_resp": data.shape[0],
            "N_context": self.context_hash_map.shape[0],
            "N_subj": self.subj_hash_map.shape[0],
            "context": context_hashed,
            "subj": subj_hashed,
            "resp": data.response.astype(float).values
        }

        return self.model_data

    @property
    def context_posterior_estimates(self):
        """Normal approximations (mean, std) to each context intercept's posterior.

        Fits a normal distribution to the posterior samples of each
        by-context intercept and returns a dataframe joined with the
        context metadata, ordered by hash-map index.
        """
        context_intercept_samples = self.raw_model_fit.stan_variable("context_intercept")

        params = []
        for i in range(context_intercept_samples.shape[1]):
            mu, sigma = norm.fit(context_intercept_samples[:, i])
            context = self.context_hash_map[i]
            params.append([context, mu, sigma])

        params_df = DataFrame(params, columns=["context", "context_mean", "context_std"])
        params_df["order"] = params_df.index
        params_df = merge(params_df, self.context_info).sort_values("order")

        return params_df[["fact", "context", "prompt", "context_mean", "context_std", "order"]]
Load norming data
import os
from pandas import read_csv
# root directory for all datasets used below
data_dir = "data/"
def load_norming_data(fname: str) -> DataFrame:
    """Load the norming data, dropping filler items and free-text comments.

    Parameters
    ----------
    fname
        Path (or file-like object) of the CSV to read.
    """
    data = read_csv(fname, index_col=0)

    # F1 and F2 are filler items
    data = data[~data.item.isin(["F1", "F2"])]

    # drop the free-text comments column when present (made conditional
    # for consistency with load_projection_data)
    if "comments" in data.columns:
        data = data.drop(columns="comments")

    return data
# load the norming data for the "certain" (cd) condition
data_norming = load_norming_data(
    os.path.join(
        data_dir, "projective-probability/results/1-prior/data/cd.csv"
    )
)
Silence STAN logger
import logging

# silence cmdstanpy's logger so fitting output doesn't flood the notebook
logger = logging.getLogger('cmdstanpy')
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.CRITICAL)
Fitting the model
We can then fit this model to the norming data.
# fit the norming model (without MAP initialization)
norming_model = NormingModel()

_ = norming_model.fit(
    data_norming, map_initialization=False
)
Investigating the fit
In plotting the posterior samples for \(\rho^\text{context}_c\), we observe a clear effect of itemType
–both in log-odds space…
from arviz import plot_forest

# forest plot of the by-context intercepts (log-odds space)
_ = plot_forest(
    norming_model.model_fit,
    var_names=["context_intercept"],
    combined=True,
    figsize=(11.5, 10)
)
…and in probability space.
# forest plot of the by-context probabilities (probability space)
_ = plot_forest(
    norming_model.model_fit,
    var_names=["context_prob"],
    combined=True,
    figsize=(11.5, 10)
)
If we look at the concrete example we looked at here, we see that these posterior estimates accord with expectation.
from scipy.special import expit
from pandas import merge, melt

# the norming data for 10H and 10L
data_norming_sub = data_norming.query('item.isin(["10H", "10L"])')

# the samples from the posterior for 10H and 10L
samples = DataFrame(
    norming_model.raw_model_fit.stan_variable("context_intercept"),
    columns=norming_model.context_hash_map
)

samples = merge(
    melt(samples, var_name="context", value_name="logodds"),
    norming_model.context_info
)

# map log-odds samples into probability space
samples["prob"] = expit(samples.logodds)

# the final character of the context ID encodes item type (H/L)
samples["itemType"] = samples.context.map(lambda x: x[-1])

samples_sub = samples.query('context.isin(["10H", "10L"])')
context | fact | prompt | logodds | prob | |
---|---|---|---|---|---|
0 | 10H | Zoe is a math major. | How likely is it that Zoe calculated the tip? | 1.85264 | 0.864437 |
1 | 10H | Zoe is a math major. | How likely is it that Zoe calculated the tip? | 1.72780 | 0.849131 |
2 | 10H | Zoe is a math major. | How likely is it that Zoe calculated the tip? | 1.79174 | 0.857140 |
3 | 10H | Zoe is a math major. | How likely is it that Zoe calculated the tip? | 1.88675 | 0.868385 |
4 | 10H | Zoe is a math major. | How likely is it that Zoe calculated the tip? | 1.72497 | 0.848768 |
... | ... | ... | ... | ... | ... |
15995 | 10L | Zoe is 5 years old. | How likely is it that Zoe calculated the tip? | -7.80021 | 0.000409 |
15996 | 10L | Zoe is 5 years old. | How likely is it that Zoe calculated the tip? | -6.95483 | 0.000953 |
15997 | 10L | Zoe is 5 years old. | How likely is it that Zoe calculated the tip? | -3.82970 | 0.021255 |
15998 | 10L | Zoe is 5 years old. | How likely is it that Zoe calculated the tip? | -5.37417 | 0.004613 |
15999 | 10L | Zoe is 5 years old. | How likely is it that Zoe calculated the tip? | -5.15523 | 0.005736 |
16000 rows × 5 columns
Plotting code
from matplotlib.pyplot import subplots
from seaborn import histplot

# side-by-side: raw responses vs. posterior samples for the same prompt
fig, (ax1, ax2) = subplots(1, 2, figsize=(10, 4))
fig.suptitle("How likely is it that Zoe calculated the tip?")

ax1.set_title("Distribution of responses")

p = histplot(
    data=data_norming_sub, x="response", hue="fact",
    hue_order=["Zoe is 5 years old.", "Zoe is a math major."],
    bins=15,
    ax=ax1,
    stat="density"
)

ax2.set_title("Samples from the posterior")

p = histplot(
    data=samples_sub, x="prob", hue="fact",
    hue_order=["Zoe is 5 years old.", "Zoe is a math major."],
    bins=30,
    ax=ax2,
    stat="density"
)
Note that the posterior samples are for what amounts to the mean response, so we don’t expect the distribution of samples to be the same as the distribution of responses.
Estimating context-specific priors
Our aim in fitting this model is to be able to estimate context-specific priors. Ideally, we could just use the samples from the prior visualized above, but we can’t for practical reasons: STAN needs a known functional form for the prior. That is the point of trying to estimate \(\mu^\text{context}_c\) and \(\sigma^\text{context}_c\) under the assumption that \(\rho_c \mid \mathbf{y}_\text{norming} \sim \mathcal{N}(\mu^\text{context}_c, \sigma^\text{context}_c)\).
# tabulate normal approximations to the context-intercept posteriors,
# indexed by context
context_posterior_estimates = norming_model.context_posterior_estimates
context_posterior_estimates["itemType"] = context_posterior_estimates.context.map(lambda x: x[-1])
context_posterior_estimates = context_posterior_estimates.set_index("context")
Plotting code
# distribution of the estimated prior means, split by item type
p = histplot(
    data=context_posterior_estimates, x="context_mean",
    hue="itemType", hue_order=["L", "H"], bins=15
)

p.set_title("Distribution of means for context-specific priors")
_ = p.set_xlabel(r"$\mu_c$")
To assess how good this assumption of normality is, we can compare the empirical CDF derived from the posterior samples with the normal CDF implied by \(\mu^\text{context}_c\) and \(\sigma^\text{context}_c\) for a particular context \(c\).
When \(\mu_c\) is in the middle of the scale, the normal approximation is effectively perfect.
Plotting code
from numpy import mgrid
from scipy.special import logit
from statsmodels.distributions.empirical_distribution import ECDF
from matplotlib.pyplot import subplot, Axes

def plot_context_intercept_posterior(context_id: str, ax: Axes, axis: str="unit", plot_diff: bool=True):
    """Compare the empirical CDF of a context intercept's posterior samples
    with the CDF of its fitted normal approximation.

    Parameters
    ----------
    context_id
        The context to plot (an index of ``context_posterior_estimates``).
    ax
        The matplotlib axes to draw on.
    axis
        "unit" to plot in probability space, "reals" for log-odds space.
    plot_diff
        Whether to additionally plot ECDF minus normal CDF.

    Raises
    ------
    ValueError
        If ``axis`` is neither "unit" nor "reals".
    """
    context_estimates = context_posterior_estimates.loc[context_id]

    estimated_dist = norm(context_estimates.context_mean, context_estimates.context_std)

    samples = norming_model.raw_model_fit.stan_variable("context_intercept")[:, context_estimates.order]

    if axis == "unit":
        x_axis = mgrid[0.01:1:0.01]

        # move samples into probability space; the normal approximation
        # stays in log-odds space, so its CDF is evaluated at logit(x)
        samples = expit(samples)

        ax.plot(
            x_axis,
            ECDF(samples)(x_axis),
            label="ECDF"
        )
        ax.plot(
            x_axis,
            estimated_dist.cdf(logit(x_axis)),
            label="Normal approximation"
        )

        if plot_diff:
            ax.plot(
                x_axis,
                ECDF(samples)(x_axis) - estimated_dist.cdf(logit(x_axis)),
                label="Difference"
            )
    elif axis == "reals":
        x_axis = mgrid[samples.min():samples.max():0.01]

        ax.plot(
            x_axis,
            ECDF(samples)(x_axis),
            label="ECDF"
        )
        ax.plot(
            x_axis,
            estimated_dist.cdf(x_axis),
            label="Normal approximation"
        )

        if plot_diff:
            ax.plot(
                x_axis,
                ECDF(samples)(x_axis) - estimated_dist.cdf(x_axis),
                label="Difference"
            )
    else:
        raise ValueError("'axis' must be \"unit\" or \"reals\".")

    return ax
# mid-scale example: the normal approximation is effectively perfect
fig, (ax1, ax2) = subplots(1, 2, figsize=(10, 4))
fig.suptitle("Josh is a 5-year old boy.\nHow likely is it that Josh learned to ride a bike yesterday?")

plot_context_intercept_posterior("16H", axis="reals", ax=ax1)
plot_context_intercept_posterior("16H", axis="unit", ax=ax2)

ax1.legend()

ax1.set_xlabel("Log-odds")
_ = ax2.set_xlabel("Probability")
When the likelihood is low, the approximation is slightly worse, though it remains quite good.
# low-likelihood example: the approximation is slightly worse
fig, (ax1, ax2) = subplots(1, 2, figsize=(10, 4))
fig.suptitle("Isabella is a vegetarian.\nHow likely is it that Isabella ate a steak on Sunday?")

plot_context_intercept_posterior("7L", axis="reals", ax=ax1)
plot_context_intercept_posterior("7L", axis="unit", ax=ax2)

ax1.legend()

ax1.set_xlabel("Log-odds")
_ = ax2.set_xlabel("Probability")
A similar phenomenon is observed when the mean is high–though, again, the approximation remains quite good.
Plotting code
# high-mean example: the approximation remains quite good
fig, (ax1, ax2) = subplots(1, 2, figsize=(10, 4))
fig.suptitle("Mary is taking a prenatal yoga class.\nHow likely is it that Mary is pregnant?")

plot_context_intercept_posterior("1H", axis="reals", ax=ax1)
plot_context_intercept_posterior("1H", axis="unit", ax=ax2)

ax1.legend()

ax1.set_xlabel("Log-odds")
_ = ax2.set_xlabel("Probability")
Models of projection
Turning now to the models of the projection data: we’ll also implement these as a subclass of our StanModel
ABC. Because there are a few different versions of this model we’ll want to use–one that uses context-specific priors and another that uses verb-specific priors–we’ll need to set this class up in a slightly more complicated way. Basically, we’ll store different blocks of STAN code in different files and have class construct the full model specification on the fly based on the parameters to ProjectionModel.__init__
.
To actually use the estimates for context- or verb-specific priors from some other model fit, we’ll additionally need to pass that fit model to ProjectionModel.__init__
. This makes the initialization logic–as well as the data construction logic–somewhat complex, while keeping the core fitting procedure the same.
from numpy import zeros, ones

@dataclass
class ProjectionData(NormingData):
    """Shape of the data dict expected by the projection STAN models."""
    N_verb: int            # number of verbs
    verb: ndarray          # verb corresponding to response n
    verb_mean: ndarray     # the verb means inferred from a previous model fit
    verb_std: ndarray      # the verb standard deviations inferred from a previous model fit
    context_mean: ndarray  # the context means inferred from the norming data
    context_std: ndarray   # the context standard deviations inferred from the norming data
from typing import Union

# STAN file containing the parameters and model blocks for each prior
# configuration (which of the context/verb priors are fixed from a
# previous fit)
parameters_and_model_block_files = {
    "no_priors_fixed": "parameters-and-model-block.stan",
    "verb_priors_fixed": "parameters-and-model-block-verb-prior-fixed.stan",
    "context_priors_fixed": "parameters-and-model-block-context-prior-fixed.stan",
    "both_priors_fixed": "parameters-and-model-block-context-and-verb-priors-fixed.stan",
}
class ProjectionModel(StanModel):
    """An ABC for STAN models of Degen and Tonhauser's projection data.

    The full STAN program is assembled on the fly from four blocks; which
    parameters-and-model block is used depends on whether context- and/or
    verb-specific priors are fixed from a previously fit ``prior_model``.
    Subclasses must define ``stan_functions_block_file`` (the likelihood)
    and ``stan_file`` (where the assembled program is written).
    """

    stan_data_block_file = "models/projection-model/data-block.stan"
    stan_generated_quantities_block_file = "models/projection-model/generated-quantities-block.stan"

    data_class = ProjectionData

    def __init__(
        self, prior_model: Optional[Union[NormingModel, 'ProjectionModel']] = None,
        use_context_prior: bool = True
    ):
        self.prior_model = prior_model

        # context priors require a prior model to draw estimates from
        self.use_context_priors = use_context_prior and prior_model is not None

        # verb priors are available only when the prior model exposes
        # verb posterior estimates (i.e. it is a fit ProjectionModel)
        self.use_verb_priors = hasattr(
            prior_model, "verb_posterior_estimates"
        )

        if self.use_context_priors and self.use_verb_priors:
            print("Model initialized with context- and verb-specific priors derived "
                  "from context- and verb-specific posteriors from prior_model.")
            self.context_hash_map = prior_model.context_hash_map
            self.verb_hash_map = prior_model.verb_hash_map
            self.stan_parameters_and_model_block_file = os.path.join(
                "models/projection-model/parameters-and-model-block/",
                parameters_and_model_block_files["both_priors_fixed"]
            )
        elif self.use_context_priors:
            print("Model initialized with context-specific priors derived "
                  "from context-specific posteriors from prior_model.")
            self.context_hash_map = prior_model.context_hash_map
            self.stan_parameters_and_model_block_file = os.path.join(
                "models/projection-model/parameters-and-model-block/",
                parameters_and_model_block_files["context_priors_fixed"]
            )
        elif self.use_verb_priors:
            print("Model initialized with verb-specific priors derived "
                  "from verb-specific posteriors from prior_model.")
            self.verb_hash_map = prior_model.verb_hash_map
            self.stan_parameters_and_model_block_file = os.path.join(
                "models/projection-model/parameters-and-model-block/",
                parameters_and_model_block_files["verb_priors_fixed"]
            )
        else:
            self.stan_parameters_and_model_block_file = os.path.join(
                "models/projection-model/parameters-and-model-block/",
                parameters_and_model_block_files["no_priors_fixed"]
            )

        # assemble the full STAN program before the superclass compiles it
        self._write_stan_file()

        super().__init__()

    def _write_stan_file(self):
        """Concatenate the four STAN blocks into ``self.stan_file``."""
        # context managers ensure the block files are closed after reading
        with open(self.stan_functions_block_file, "r") as f:
            functions_block = f.read()
        with open(self.stan_data_block_file, "r") as f:
            data_block = f.read()
        with open(self.stan_parameters_and_model_block_file, "r") as f:
            parameters_and_model_block = f.read()
        with open(self.stan_generated_quantities_block_file, "r") as f:
            generated_quantities_block = f.read()

        print(f"Writing STAN file to {self.stan_file}...")

        with open(self.stan_file, "w") as f:
            f.write(functions_block + "\n\n")
            f.write(data_block + "\n\n")
            f.write(parameters_and_model_block + "\n\n")
            f.write(generated_quantities_block)

    @property
    @abstractmethod
    def stan_functions_block_file(self):
        """Path to the STAN functions block defining the likelihood."""
        raise NotImplementedError

    def construct_context_info(self, data: DataFrame):
        """Reuse the prior model's context table, or build one from the data."""
        if hasattr(self.prior_model, "context_info"):
            self.context_info = self.prior_model.context_info
        else:
            # the projection data calls the prompt column "content"
            data["prompt"] = data["content"]
            NormingModel.construct_context_info(self, data)

    def construct_model_data(self, data: DataFrame):
        """Build the STAN data dict, extending the norming data with verbs
        and (possibly fixed) prior parameters."""
        self.model_data = NormingModel.construct_model_data(self, data)

        if hasattr(self, "verb_hash_map"):
            _, verb_hashed = hash_series(data.verb, self.verb_hash_map)
        else:
            self.verb_hash_map, verb_hashed = hash_series(data.verb)

        self.coords.update({
            "verb": self.verb_hash_map
        })

        self.dims.update({
            "verb_intercept": ["verb"],
            "verb_prob": ["verb"]
        })

        self.model_data.update({
            "N_verb": self.verb_hash_map.shape[0],
            "verb": verb_hashed
        })

        # when priors are not fixed from a previous fit, fall back to
        # standard-normal parameters
        if self.use_context_priors:
            self.model_data.update({
                "context_mean": self.context_prior_estimates.context_mean.values,
                "context_std": self.context_prior_estimates.context_std.values
            })
        else:
            self.model_data.update({
                "context_mean": zeros(self.model_data["N_context"]),
                "context_std": ones(self.model_data["N_context"]),
            })

        if self.use_verb_priors:
            self.model_data.update({
                "verb_mean": self.verb_prior_estimates.verb_mean.values,
                "verb_std": self.verb_prior_estimates.verb_std.values,
            })
        else:
            self.model_data.update({
                "verb_mean": zeros(self.model_data["N_verb"]),
                "verb_std": ones(self.model_data["N_verb"]),
            })

        return self.model_data

    @property
    def context_prior_estimates(self):
        """Context prior parameters taken from the prior model's posterior."""
        if self.use_context_priors:
            return self.prior_model.context_posterior_estimates
        else:
            raise AttributeError("no prior_model supplied for context priors")

    @property
    def context_posterior_estimates(self):
        """Normal approximations (mean, std) to each context intercept's posterior."""
        context_intercept_samples = self.raw_model_fit.stan_variable("context_intercept")

        params = []
        for i in range(context_intercept_samples.shape[1]):
            mu, sigma = norm.fit(context_intercept_samples[:, i])
            context = self.context_hash_map[i]
            params.append([context, mu, sigma])

        params_df = DataFrame(params, columns=["context", "context_mean", "context_std"])
        params_df["order"] = params_df.index
        params_df = merge(params_df, self.context_info).sort_values("order")

        return params_df[["fact", "context", "prompt", "context_mean", "context_std", "order"]]

    @property
    def verb_prior_estimates(self):
        """Verb prior parameters taken from the prior model's posterior."""
        if self.use_verb_priors:
            return self.prior_model.verb_posterior_estimates
        else:
            raise AttributeError("prior_model must have verb_posterior_estimates")

    @property
    def verb_posterior_estimates(self):
        """Normal approximations (mean, std) to each verb intercept's posterior."""
        verb_intercept_samples = self.raw_model_fit.stan_variable("verb_intercept")

        params = []
        for i in range(verb_intercept_samples.shape[1]):
            mu, sigma = norm.fit(verb_intercept_samples[:, i])
            verb = self.verb_hash_map[i]
            params.append([verb, mu, sigma])

        params_df = DataFrame(params, columns=["verb", "verb_mean", "verb_std"])
        params_df["order"] = params_df.index

        return params_df
To implement a particular subtype of projection model, we then simply need to define a subclass that specifies where the functions
block is located–remember, we factored the models such that they differ only in their definitions of the likelihood function–and where to write the full model code out to.
class FullyDiscreteProjectionModel(ProjectionModel):
    """Projection model with discrete verb and discrete context components."""
    stan_functions_block_file = "models/projection-model/fully-discrete/fully-discrete-likelihoods.stan"
    stan_file = "models/projection-model/fully-discrete/fully-discrete-model.stan"

class VerbDiscreteProjectionModel(ProjectionModel):
    """Projection model with a discrete verb and gradient context component."""
    stan_functions_block_file = "models/projection-model/verb-discrete/verb-discrete-likelihoods.stan"
    stan_file = "models/projection-model/verb-discrete/verb-discrete-model.stan"

class ContextDiscreteProjectionModel(ProjectionModel):
    """Projection model with a gradient verb and discrete context component."""
    stan_functions_block_file = "models/projection-model/context-discrete/context-discrete-likelihoods.stan"
    stan_file = "models/projection-model/context-discrete/context-discrete-model.stan"

class FullyGradientProjectionModel(ProjectionModel):
    """Projection model with gradient verb and gradient context components."""
    stan_functions_block_file = "models/projection-model/fully-gradient/fully-gradient-likelihoods.stan"
    stan_file = "models/projection-model/fully-gradient/fully-gradient-model.stan"
Load norming data
def load_projection_data(fname: str) -> DataFrame:
    """Load projection data, dropping controls and deriving item identifiers.

    Parameters
    ----------
    fname
        Path (or file-like object) of the CSV to read.
    """
    data = read_csv(fname, index_col=0)

    if "comments" in data.columns:
        data = data.drop(columns="comments")

    data = data[data.trigger_class != "control"]

    # e.g. fact_type "Hfact" -> itemType "H"; item "10" + "H" -> "10H"
    data["itemType"] = data.fact_type.str.replace("fact", "")
    data["item"] = data.contentNr.astype(str) + data.fact_type.str.replace("fact", "")

    return data
# load the projection data for the "certain" (cd) condition
data_projection = load_projection_data(
    os.path.join(
        data_dir, "projective-probability/results/3-projectivity/data/cd.csv"
    )
)
Fitting the model
We can then fit each of the models. We’ll look at what different models learn about the verbs in more detail once we’ve fit them all and run our model comparison.
# fit each projection model, using the fit norming model to fix the
# context-specific priors
fully_discrete_projection_model = FullyDiscreteProjectionModel(norming_model)
_ = fully_discrete_projection_model.fit(data_projection)

verb_discrete_projection_model = VerbDiscreteProjectionModel(norming_model)
_ = verb_discrete_projection_model.fit(data_projection)

context_discrete_projection_model = ContextDiscreteProjectionModel(norming_model)
_ = context_discrete_projection_model.fit(data_projection)

fully_gradient_projection_model = FullyGradientProjectionModel(norming_model)
_ = fully_gradient_projection_model.fit(data_projection)
Model comparison
We can now run model comparison.
from arviz import compare, plot_compare

# models keyed by the labels used in the comparison plots
models = {
    "Verb Discrete\nContext Discrete": fully_discrete_projection_model,
    "Verb Discrete\nContext Gradient": verb_discrete_projection_model,
    "Verb Gradient\nContext Discrete": context_discrete_projection_model,
    "Verb Gradient\nContext Gradient": fully_gradient_projection_model
}

projection_model_comparison = compare({
    m_name: m.model_fit for m_name, m in models.items()
})
Plotting code
= plot_compare(projection_model_comparison) _
The main thing to note here is that both models associated with the indeterminacy hypothesis dominate both models associated with the fundamental gradience hypothesis, with the verb discrete model performing the best by far.
Investigating the fits
We can now turn to understanding how each model fits the data. To do this, we can look at the probabilities associated with each verb.
from pandas import concat

# gather per-verb probability samples from each model into long format
verb_probs = []

for m_name, m in models.items():
    verb_probs_sub = DataFrame(
        m.raw_model_fit.stan_variable("verb_prob"),
        columns=m.verb_hash_map
    )
    verb_probs_sub["model"] = m_name
    verb_probs.append(verb_probs_sub)

verb_probs = concat(verb_probs)
verb_probs = melt(verb_probs, id_vars="model")
model | variable | value | |
---|---|---|---|
0 | Verb Discrete\nContext Discrete | acknowledge | 2.699850e-01 |
1 | Verb Discrete\nContext Discrete | acknowledge | 2.912600e-01 |
2 | Verb Discrete\nContext Discrete | acknowledge | 2.696550e-01 |
3 | Verb Discrete\nContext Discrete | acknowledge | 1.921890e-01 |
4 | Verb Discrete\nContext Discrete | acknowledge | 2.014510e-01 |
... | ... | ... | ... |
639995 | Verb Gradient\nContext Gradient | think | 4.363940e-22 |
639996 | Verb Gradient\nContext Gradient | think | 6.227660e-15 |
639997 | Verb Gradient\nContext Gradient | think | 1.147610e-17 |
639998 | Verb Gradient\nContext Gradient | think | 1.003010e-17 |
639999 | Verb Gradient\nContext Gradient | think | 2.235940e-10 |
640000 rows × 3 columns
Plotting code
from matplotlib.pyplot import subplots
from seaborn import boxplot

# order verbs by their mean probability under the fully gradient model
verb_probs_fully_gradient = verb_probs[verb_probs.model=="Verb Gradient\nContext Gradient"]

verb_order = verb_probs_fully_gradient.groupby("variable")["value"].mean()
verb_order = verb_order.sort_values(ascending=False)

# order models by their maximum probability
model_order = verb_probs.groupby("model")["value"].max()
model_order = model_order.sort_values(ascending=False)

fig, ax = subplots(figsize=(11.5, 14))

_ = boxplot(
    verb_probs,
    x="value", y="variable", hue="model",
    order=verb_order.index,
    hue_order=model_order.index,
    fliersize=0., ax=ax
)
One interesting thing to note here is that both the fully gradient model (verb gradient-context gradient) and the context discrete model (verb gradient-context discrete) tend to have much more extreme probabilities associated with each verb than the two other models. We can see this pattern even more clearly if we plot the mean value for each verb.
Plotting code
from seaborn import kdeplot

# mean verb probability per model
mean_verb_probs_by_model = verb_probs.groupby(["model", "variable"]).value.mean().reset_index()

_ = kdeplot(
    mean_verb_probs_by_model,
    x="value", hue="model",
    cut=0., hue_order=model_order.index
)
What this pattern would seem to suggest is that the two models associated with the fundamental gradience hypothesis are, in some sense, trying to simulate those associated with the indeterminacy hypothesis.
Conversely, the probabilities associated with the models associated with the indeterminacy hypothesis, suggest much more variability in projectivity–consistent with the original observation by White and Rawlins (2018) and the later observations by Degen and Tonhauser (2022) and Kane, Gantt, and White (2022). Putting the findings of Kane, Gantt, and White (2022) together with these findings would seem to lend strong support to the indeterminacy hypothesis.
Modeling the bleached and templatic data
To further evaluate these models, let’s fit the variants discussed in the last section to the bleached and templatic datasets. The main change in how we fit these models–compared to the models we fit to Degen and Tonhauser data–is that we’ll use the models fit to their data to determine the verb-specific priors on the by-verb random intercepts.
Modeling the bleached data
First, we’ll fit to the bleached data.
Load bleached data
data_projection_bleached = load_projection_data(
    os.path.join(
        data_dir, "projective-probability-replication/bleached.csv"
    )
)

# the replication data identifies subjects as `participant` rather than
# `workerid`
data_projection_bleached["workerid"] = data_projection_bleached.participant
# fit each model to the bleached data, fixing verb-specific priors from
# the corresponding model fit to Degen and Tonhauser's data (but not
# context-specific priors)
fully_discrete_projection_model_bleached = FullyDiscreteProjectionModel(
    fully_discrete_projection_model, use_context_prior=False
)

fully_discrete_projection_model_bleached.fit(
    data_projection_bleached, map_initialization=False,
)

verb_discrete_projection_model_bleached = VerbDiscreteProjectionModel(
    verb_discrete_projection_model, use_context_prior=False
)

verb_discrete_projection_model_bleached.fit(
    data_projection_bleached, map_initialization=False,
)

context_discrete_projection_model_bleached = ContextDiscreteProjectionModel(
    context_discrete_projection_model, use_context_prior=False
)

context_discrete_projection_model_bleached.fit(
    data_projection_bleached, map_initialization=False,
)

fully_gradient_projection_model_bleached = FullyGradientProjectionModel(
    fully_gradient_projection_model, use_context_prior=False
)

fully_gradient_projection_model_bleached.fit(
    data_projection_bleached, map_initialization=False,
)
In running the model comparison, we observe the same pattern of results we observed for Degen and Tonhauser’s data.
projection_model_bleached_comparison = compare({
    "Verb Discrete\nContext Discrete": fully_discrete_projection_model_bleached.model_fit,
    "Verb Discrete\nContext Gradient": verb_discrete_projection_model_bleached.model_fit,
    "Verb Gradient\nContext Discrete": context_discrete_projection_model_bleached.model_fit,
    "Verb Gradient\nContext Gradient": fully_gradient_projection_model_bleached.model_fit
})
Plotting code
= plot_compare(projection_model_bleached_comparison) _
We can also see that the standard deviation of the by-context intercepts is extremely small–especially compared to the standard deviations of the by-subject intercepts.1
Plotting code
# compare the by-context intercept std with the by-subject intercept stds
_ = plot_forest(
    verb_discrete_projection_model_bleached.model_fit,
    var_names=["context_intercept_std", "subj_intercept_verb_std", "subj_intercept_context_std"],
    combined=True,
    figsize=(11.5, 2),
)
This small standard deviation is what we should expect here: one cannot in fact have prior beliefs about the bleached contexts, so subjects just assume that the bleached content has a roughly 50-50 chance of being true.
Modeling the templatic data
We’ll do the same for the templatic data.
Load templatic data
data_projection_templatic = load_projection_data(
    os.path.join(
        data_dir, "projective-probability-replication/templatic.csv"
    )
)

# the replication data identifies subjects as `participant` rather than
# `workerid`
data_projection_templatic["workerid"] = data_projection_templatic.participant
# fit each model to the templatic data, fixing verb-specific priors from
# the corresponding model fit to Degen and Tonhauser's data
fully_discrete_projection_model_templatic = FullyDiscreteProjectionModel(
    fully_discrete_projection_model, use_context_prior=False
)
_ = fully_discrete_projection_model_templatic.fit(
    data_projection_templatic, map_initialization=False,
)

verb_discrete_projection_model_templatic = VerbDiscreteProjectionModel(
    verb_discrete_projection_model, use_context_prior=False
)
_ = verb_discrete_projection_model_templatic.fit(
    data_projection_templatic, map_initialization=False,
)

context_discrete_projection_model_templatic = ContextDiscreteProjectionModel(
    context_discrete_projection_model, use_context_prior=False
)
_ = context_discrete_projection_model_templatic.fit(
    data_projection_templatic, map_initialization=False,
)

fully_gradient_projection_model_templatic = FullyGradientProjectionModel(
    fully_gradient_projection_model, use_context_prior=False
)
_ = fully_gradient_projection_model_templatic.fit(
    data_projection_templatic, map_initialization=False,
)
projection_model_templatic_comparison = compare({
    "Verb Discrete\nContext Discrete": fully_discrete_projection_model_templatic.model_fit,
    "Verb Discrete\nContext Gradient": verb_discrete_projection_model_templatic.model_fit,
    "Verb Gradient\nContext Discrete": context_discrete_projection_model_templatic.model_fit,
    "Verb Gradient\nContext Gradient": fully_gradient_projection_model_templatic.model_fit
})
In running this model comparison, we observe a similar pattern of results, with the verb discrete model pulling even farther ahead.
Plotting code
= plot_compare(projection_model_templatic_comparison) _
As expected, we also observe that the standard deviation of the by-context intercepts is extremely small, which we expect for the same reasons as for the models fit to the bleached data.
Plotting code
# compare the by-context intercept std with the by-subject intercept stds
_ = plot_forest(
    verb_discrete_projection_model_templatic.model_fit,
    var_names=["context_intercept_std", "subj_intercept_verb_std", "subj_intercept_context_std"],
    combined=True,
    figsize=(11.5, 2),
)
Summing up
In this module, we considered two subtly distinct questions: (i) whether there is evidence for discrete classes of lexical representations that determine inferences commonly associated with factive predicates or whether this knowledge is fundamentally continuous; and (ii) how, for aspects of lexical knowledge that are fundamentally continuous, that knowledge is integrated with world knowledge. Relevant to the first question, we saw evidence from Kane, Gantt, and White (2022) that, when we appropriately account for various sources of gradience in inference judgments, we observe a small number of clear clusters of predicates, all of which correspond cleanly to the predicate classes one might expect from the literature on clause-embedding predicates and a subset of which correspond to traditional subclassifications of factives. To address the second question, we modeled data collected by Degen and Tonhauser (2021) showing that models assuming that gradience comes from indeterminacy outperform models that assume fundamental gradience.
References
Footnotes
Remember that this standard deviation is in log-odds space.↩︎