From 0639ea9fb47c76ea5c8505714815d60b24b6c97d Mon Sep 17 00:00:00 2001 From: William Davies Date: Sun, 7 Jun 2026 21:26:45 -0500 Subject: [PATCH] Added Fama-French 3-factor and 5-factor expected return models --- README.md | 4 + docs/ExpectedReturns.rst | 16 +- pypfopt/expected_returns.py | 125 ++++++++++++++- tests/test_expected_returns.py | 274 +++++++++++++++++++++++++++++++++ 4 files changed, 417 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6aaa632e..9405f790 100755 --- a/README.md +++ b/README.md @@ -211,6 +211,10 @@ A far more comprehensive version of this can be found on [ReadTheDocs](https://p - Capital Asset Pricing Model (CAPM): - a simple model to predict returns based on the beta to the market - this is used all over finance! +- Fama-French 3-factor model: + - estimates expected returns using exposures to market, size, and value factors +- Fama-French 5-factor model: + - extends FF3 with profitability and investment factors ### Risk models (covariance) diff --git a/docs/ExpectedReturns.rst b/docs/ExpectedReturns.rst index a1c7247c..2cbc6e54 100755 --- a/docs/ExpectedReturns.rst +++ b/docs/ExpectedReturns.rst @@ -6,7 +6,7 @@ Expected Returns Mean-variance optimization requires knowledge of the expected returns. In practice, these are rather difficult to know with any certainty. Thus the best we can do is to -come up with estimates, for example by extrapolating historical data, This is the +come up with estimates, for example by extrapolating historical data or by using factor models. This is the main flaw in mean-variance optimization – the optimization procedure is sound, and provides strong mathematical guarantees, *given the correct inputs*. This is one of the reasons why I have emphasised modularity: users should be able to come up with their own @@ -51,6 +51,20 @@ superior models and feed them into the optimizer. .. autofunction:: returns_from_prices .. autofunction:: prices_from_returns + + .. 
autofunction:: ff_return + + Estimates expected returns using the Fama-French factor models. + + Set ``model="ff3"`` to use the three-factor specification with + market (Mkt-RF), size (SMB), and value (HML) factors. + + Set ``model="ff5"`` to use the five-factor specification, which + additionally includes profitability (RMW) and investment (CMA) + factors. + + Factor data must be supplied via the ``factor_data`` argument and + contain the required factor return columns indexed by date. .. References diff --git a/pypfopt/expected_returns.py b/pypfopt/expected_returns.py index bbbf69c6..2a739206 100644 --- a/pypfopt/expected_returns.py +++ b/pypfopt/expected_returns.py @@ -16,6 +16,7 @@ - mean historical return - exponentially weighted mean historical return - CAPM estimate of returns + - Fama-French 3-factor and 5-factor estimates of returns Additionally, we provide utility functions to convert from returns to prices and vice-versa. """ @@ -105,6 +106,8 @@ def return_model(prices, method="mean_historical_return", **kwargs): - ``mean_historical_return`` - ``ema_historical_return`` - ``capm_return`` + - ``ff3_return`` + - ``ff5_return`` Raises ------ @@ -114,7 +117,7 @@ def return_model(prices, method="mean_historical_return", **kwargs): Returns ------- pd.DataFrame - annualised sample covariance matrix + annualised expected return estimate for each asset """ if method == "mean_historical_return": return mean_historical_return(prices, **kwargs) @@ -122,6 +125,10 @@ def return_model(prices, method="mean_historical_return", **kwargs): return ema_historical_return(prices, **kwargs) elif method == "capm_return": return capm_return(prices, **kwargs) + elif method == "ff3_return": + return ff_return(prices, model="ff3", **kwargs) + elif method == "ff5_return": + return ff_return(prices, model="ff5", **kwargs) else: raise NotImplementedError("Return model {} not implemented".format(method)) @@ -313,3 +320,119 @@ def capm_return( # CAPM formula return risk_free_rate + betas 
* (mkt_mean_ret - risk_free_rate) + + +def ff_return( + prices, + factor_data, + returns_data=False, + model="ff3", + compounding=True, + frequency=252, + log_returns=False, +): + """ + Compute a return estimate using the Fama-French factor model. + + Parameters + ---------- + prices : pd.DataFrame + adjusted closing prices of the assets. + factor_data : pd.DataFrame + factor returns indexed by date. + + Required columns for ff3: + - RF + - Mkt-RF + - SMB + - HML + + Additional required columns for ff5: + - RMW + - CMA + + returns_data : bool, optional + if true, prices is interpreted as returns. + model : str, optional + one of {"ff3", "ff5"}. + compounding : bool, optional + use geometric annualisation if True. + frequency : int, optional + periods per year. + log_returns : bool, optional + whether to compute log returns. + + Returns + ------- + pd.Series + annualised expected returns. + """ + + if not isinstance(prices, pd.DataFrame): + warnings.warn("prices are not in a dataframe", RuntimeWarning) + prices = pd.DataFrame(prices) + + if not isinstance(factor_data, pd.DataFrame): + warnings.warn("factor_data is not in a dataframe", RuntimeWarning) + factor_data = pd.DataFrame(factor_data) + + if model not in {"ff3", "ff5"}: + raise ValueError("model must be either 'ff3' or 'ff5'") + + if returns_data: + returns = prices.copy() + else: + returns = returns_from_prices(prices, log_returns) + + _check_returns(returns) + + required = ["RF", "Mkt-RF", "SMB", "HML"] + if model == "ff5": + required.extend(["RMW", "CMA"]) + + missing = [c for c in required if c not in factor_data.columns] + if missing: + raise ValueError(f"factor_data missing required columns: {missing}") + + common_index = returns.index.intersection(factor_data.index) + if len(common_index) == 0: + raise ValueError("No overlapping dates between asset returns and factor data") + + returns = returns.loc[common_index] + factors = factor_data.loc[common_index, required].copy() + + data = 
returns.join(factors, how="inner").dropna() + if data.empty: + raise ValueError("No valid rows after aligning returns and factor data") + + returns = data[returns.columns] + factors = data[required] + + excess_returns = returns.sub(factors["RF"], axis=0) + + factor_cols = ["Mkt-RF", "SMB", "HML"] + if model == "ff5": + factor_cols.extend(["RMW", "CMA"]) + + X = np.column_stack([np.ones(len(factors)), factors[factor_cols].to_numpy()]) + factor_means = factors[factor_cols].mean().to_numpy() + + expected_returns = {} + rf_mean = factors["RF"].mean() + + for asset in excess_returns.columns: + y = excess_returns[asset].to_numpy() + beta = np.linalg.lstsq(X, y, rcond=None)[0] + alpha = beta[0] + factor_loadings = beta[1:] + + expected_period_return = rf_mean + alpha + factor_loadings @ factor_means + + if compounding: + expected_return = (1 + expected_period_return) ** frequency - 1 + else: + expected_return = expected_period_return * frequency + + expected_returns[asset] = expected_return + + return pd.Series(expected_returns, dtype="float64") diff --git a/tests/test_expected_returns.py b/tests/test_expected_returns.py index 4e16583b..d82112be 100644 --- a/tests/test_expected_returns.py +++ b/tests/test_expected_returns.py @@ -299,3 +299,277 @@ def test_log_return_passthrough(): except AssertionError: return assert False + + +def _make_ff_test_data(model="ff3", n_periods=120, n_assets=4, seed=42): + rng = np.random.default_rng(seed) + dates = pd.date_range("2020-01-01", periods=n_periods, freq="B") + + factor_cols = ["Mkt-RF", "SMB", "HML"] + if model == "ff5": + factor_cols += ["RMW", "CMA"] + + factors = pd.DataFrame( + { + "RF": rng.normal(0.0001, 0.00002, n_periods), + "Mkt-RF": rng.normal(0.0004, 0.008, n_periods), + "SMB": rng.normal(0.0002, 0.004, n_periods), + "HML": rng.normal(0.0001, 0.004, n_periods), + }, + index=dates, + ) + + if model == "ff5": + factors["RMW"] = rng.normal(0.00015, 0.003, n_periods) + factors["CMA"] = rng.normal(0.0001, 0.003, n_periods) 
+ + betas = rng.normal(0.6, 0.15, size=(n_assets, len(factor_cols))) + alphas = rng.normal(0.0002, 0.00005, size=n_assets) + + factor_matrix = factors[factor_cols].to_numpy() + excess_returns = alphas + factor_matrix @ betas.T + returns = excess_returns + factors["RF"].to_numpy()[:, None] + + returns_df = pd.DataFrame( + returns, index=dates, columns=[f"Asset {i+1}" for i in range(n_assets)] + ) + prices = expected_returns.prices_from_returns(returns_df) + prices.columns = returns_df.columns + + return prices, returns_df, factors + + +def test_ff3_return(): + prices, _, factors = _make_ff_test_data(model="ff3") + mu = expected_returns.ff_return(prices, factors, model="ff3") + + assert isinstance(mu, pd.Series) + assert list(mu.index) == list(prices.columns) + assert mu.notnull().all() + assert mu.dtype == "float64" + + +def test_ff5_return(): + prices, _, factors = _make_ff_test_data(model="ff5") + mu = expected_returns.ff_return(prices, factors, model="ff5") + + assert isinstance(mu, pd.Series) + assert list(mu.index) == list(prices.columns) + assert mu.notnull().all() + assert mu.dtype == "float64" + + +def test_ff_return_missing_columns(): + prices, _, factors = _make_ff_test_data(model="ff3") + factors = factors.drop(columns=["HML"]) + + with pytest.raises(ValueError, match="missing required columns"): + expected_returns.ff_return(prices, factors, model="ff3") + + +def test_ff_return_invalid_model(): + prices, _, factors = _make_ff_test_data(model="ff3") + + with pytest.raises(ValueError, match="model must be either"): + expected_returns.ff_return(prices, factors, model="ff4") + + +def test_ff_return_no_overlap(): + prices, _, factors = _make_ff_test_data(model="ff3") + factors.index = pd.date_range("2030-01-01", periods=len(factors), freq="B") + + with pytest.raises(ValueError, match="No overlapping dates"): + expected_returns.ff_return(prices, factors, model="ff3") + + +def test_return_model_ff3_return(): + prices, _, factors = _make_ff_test_data(model="ff3") 
+ + mu1 = expected_returns.return_model(prices, method="ff3_return", factor_data=factors) + mu2 = expected_returns.ff_return(prices, factors) + + pd.testing.assert_series_equal(mu1, mu2) + + +def test_return_model_ff5_return(): + prices, _, factors = _make_ff_test_data(model="ff5") + + mu1 = expected_returns.return_model(prices, method="ff5_return", factor_data=factors) + mu2 = expected_returns.ff_return(prices, factors) + + pd.testing.assert_series_equal(mu1, mu2) + + +def _make_ff_known_data(model="ff3"): + rng = np.random.default_rng(7) + dates = pd.date_range("2021-01-01", periods=80, freq="B") + + factors = pd.DataFrame( + { + "RF": rng.normal(0.0001, 0.00002, len(dates)), + "Mkt-RF": rng.normal(0.0004, 0.008, len(dates)), + "SMB": rng.normal(0.0002, 0.004, len(dates)), + "HML": rng.normal(0.0001, 0.004, len(dates)), + }, + index=dates, + ) + + if model == "ff5": + factors["RMW"] = rng.normal(0.00015, 0.003, len(dates)) + factors["CMA"] = rng.normal(0.00010, 0.003, len(dates)) + + factor_cols = ["Mkt-RF", "SMB", "HML"] + if model == "ff5": + factor_cols += ["RMW", "CMA"] + + alphas = np.array([0.00020, 0.00010, -0.00005]) + betas = np.array( + [ + [1.10, 0.40, -0.20, 0.00, 0.00], + [0.80, -0.10, 0.35, 0.00, 0.00], + [1.25, 0.05, -0.15, 0.00, 0.00], + ] + ) + + if model == "ff3": + betas = betas[:, :3] + + factor_matrix = factors[factor_cols].to_numpy() + returns = factors["RF"].to_numpy()[:, None] + alphas + factor_matrix @ betas.T + + returns_df = pd.DataFrame( + returns, index=dates, columns=["Asset 1", "Asset 2", "Asset 3"] + ) + return returns_df, factors, alphas, betas, factor_cols + + +def test_ff5_return_missing_columns(): + prices, _, factors = _make_ff_test_data(model="ff5") + factors = factors.drop(columns=["CMA"]) + + with pytest.raises(ValueError, match="missing required columns"): + expected_returns.ff_return(prices, factors, model="ff5") + + +def test_ff_return_no_valid_rows_after_dropna(): + prices, _, factors = _make_ff_test_data(model="ff3") + 
factors = factors.copy() + factors[["RF", "Mkt-RF", "SMB", "HML"]] = np.nan + + with pytest.raises(ValueError, match="No valid rows after aligning"): + expected_returns.ff_return(prices, factors, model="ff3") + + +def test_ff3_return_recovers_known_linear_model(): + returns_df, factors, alphas, betas, factor_cols = _make_ff_known_data(model="ff3") + + mu = expected_returns.ff_return( + returns_df, + factors, + returns_data=True, + model="ff3", + compounding=False, + frequency=1, + ) + + expected = ( + factors["RF"].mean() + + alphas + + betas @ factors[factor_cols].mean().to_numpy() + ) + expected = pd.Series(expected, index=returns_df.columns, dtype="float64") + + pd.testing.assert_series_equal(mu, expected, check_exact=False, rtol=1e-10, atol=1e-12) + + +def test_ff5_return_recovers_known_linear_model(): + returns_df, factors, alphas, betas, factor_cols = _make_ff_known_data(model="ff5") + + mu = expected_returns.ff_return( + returns_df, + factors, + returns_data=True, + model="ff5", + compounding=False, + frequency=1, + ) + + expected = ( + factors["RF"].mean() + + alphas + + betas @ factors[factor_cols].mean().to_numpy() + ) + + expected = pd.Series( + expected, + index=returns_df.columns, + dtype="float64", + ) + + pd.testing.assert_series_equal( + mu, + expected, + check_exact=False, + rtol=1e-10, + atol=1e-12, + ) + + +def test_ff_return_returns_data(): + prices, _, factors = _make_ff_test_data(model="ff3") + + returns_df = expected_returns.returns_from_prices(prices) + + mu_from_prices = expected_returns.ff_return( + prices, + factors, + model="ff3", + ) + + mu_from_returns = expected_returns.ff_return( + returns_df, + factors, + model="ff3", + returns_data=True, + ) + + pd.testing.assert_series_equal(mu_from_prices, mu_from_returns) + + +def test_ff_return_compounding_branch(): + returns_df, factors, alphas, betas, factor_cols = _make_ff_known_data(model="ff3") + + mu = expected_returns.ff_return( + returns_df, + factors, + returns_data=True, + 
model="ff3", + compounding=True, + frequency=252, + ) + + expected_period_return = ( + factors["RF"].mean() + + alphas + + betas @ factors[factor_cols].mean().to_numpy() + ) + expected = pd.Series( + (1 + expected_period_return) ** 252 - 1, + index=returns_df.columns, + dtype="float64", + ) + + pd.testing.assert_series_equal(mu, expected, check_exact=False, rtol=1e-10, atol=1e-12) + + +def test_ff_return_ignores_extra_factor_columns(): + prices, _, factors = _make_ff_test_data(model="ff3") + factors = factors.copy() + factors["Unused"] = 123.456 + + mu_with_extra = expected_returns.ff_return(prices, factors, model="ff3") + mu_without_extra = expected_returns.ff_return( + prices, factors.drop(columns=["Unused"]), model="ff3" + ) + + pd.testing.assert_series_equal(mu_with_extra, mu_without_extra)