update abhaengigkeit

This commit is contained in:
YannAhlgrim
2025-06-10 16:36:47 +02:00
parent 470b07b3a3
commit 7bb2c6535d
@@ -45,7 +45,7 @@
" Anzahl starker Bremsvorgänge mit hoher Verzögerung (>0,3g). Modellierung z.B. als Poisson-verteilte Zufallsvariable mit geringem Mittelwert (wenige Ereignisse pro 100km). Harte Bremsmanöver korrelieren mit aggressiver Fahrweise und sind prädiktiv für Fahrerwechsel. \n", " Anzahl starker Bremsvorgänge mit hoher Verzögerung (>0,3g). Modellierung z.B. als Poisson-verteilte Zufallsvariable mit geringem Mittelwert (wenige Ereignisse pro 100km). Harte Bremsmanöver korrelieren mit aggressiver Fahrweise und sind prädiktiv für Fahrerwechsel. \n",
" → Quelle: [Tesla Safety Score Definition](https://www.findmyelectric.com/blog/tesla-safety-score-beta-explained)\n", " → Quelle: [Tesla Safety Score Definition](https://www.findmyelectric.com/blog/tesla-safety-score-beta-explained)\n",
"\n", "\n",
"4. **Geschwindigkeitsüberschreitungen (ordinal):** \n", "4. **Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien):** \n",
" Häufigkeit bzw. Ausmaß, mit dem Tempolimits überschritten werden. Kategorisierung z.B. in: \n", " Häufigkeit bzw. Ausmaß, mit dem Tempolimits überschritten werden. Kategorisierung z.B. in: \n",
" - *selten*, \n", " - *selten*, \n",
" - *manchmal*, \n", " - *manchmal*, \n",
@@ -58,24 +58,26 @@
"\n", "\n",
"#### Nicht erklärende Variablen:\n", "#### Nicht erklärende Variablen:\n",
"\n", "\n",
"5. **Straßentyp (nominal, 3 Kategorien):** \n",
" - *Autobahn* (~30–33%) \n",
" - *Außerorts* (~43%) \n",
" - *Innerorts* (~24–27%) \n",
" → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n",
" → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n",
"\n", "\n",
"6. **Wetterbedingungen (nominal, 3 Kategorien):** \n", "5. **Wetterbedingungen (nominal, 3 Kategorien):** \n",
" - *trocken* (~70–75%) \n", " - *trocken* (~70–75%) \n",
" - *nass* (~20–25%) \n", " - *nass* (~20–25%) \n",
" - *winterlich* (<5%) \n", " - *winterlich* (<5%) \n",
" → Modelliert als Multinomial. Kontextvariable. \n", " → Modelliert als Multinomial. Kontextvariable. \n",
" Die Einteilung basiert auf Daten des Statistischen Bundesamtes: Etwa 70–75% aller Fahrten finden unter trockenen Bedingungen statt, 20–25% bei Nässe und unter 5% bei Schnee oder Glätte.\n", " Die Einteilung basiert auf Daten des Statistischen Bundesamtes: Etwa 70–75% aller Fahrten finden unter trockenen Bedingungen statt, 20–25% bei Nässe und unter 5% bei Schnee oder Glätte.\n",
"\n", "\n",
"7. **Fahrstrecke (metrisch):** \n", "6. **Fahrstrecke (metrisch):** \n",
" Länge der Fahrt (z.B. lognormalverteilt um 12km). Kontextvariable, nicht direkt erklärend. \n", " Länge der Fahrt (z.B. lognormalverteilt um 12km). Kontextvariable, nicht direkt erklärend. \n",
" → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n", " → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n",
"\n", "\n",
" \n",
"7. **Straßentyp (nominal, 3 Kategorien):** \n",
" - *Autobahn* (~30–33%) \n",
" - *Außerorts* (~43%) \n",
" - *Innerorts* (~24–27%) \n",
" → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n",
" → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n",
"\n",
"8. **Wochentag (nominal, 2–3 Kategorien):** \n", "8. **Wochentag (nominal, 2–3 Kategorien):** \n",
" Z.B. *Werktag* vs *Wochenende* oder *Mo–Fr*, *Sa*, *So*. Modellierung als kategorische Variable. Kontextvariable. \n", " Z.B. *Werktag* vs *Wochenende* oder *Mo–Fr*, *Sa*, *So*. Modellierung als kategorische Variable. Kontextvariable. \n",
" → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n" " → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n"
@@ -91,7 +93,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 50,
"id": "97ba29a9", "id": "97ba29a9",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -102,6 +104,8 @@
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"import seaborn as sns\n", "import seaborn as sns\n",
"from scipy.stats import norm, multinomial, poisson, lognorm, bernoulli\n", "from scipy.stats import norm, multinomial, poisson, lognorm, bernoulli\n",
"import statsmodels.api as sm\n",
"from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
"\n", "\n",
"np.random.seed(11)\n", "np.random.seed(11)\n",
"N = 1_000_000" "N = 1_000_000"
@@ -117,7 +121,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 51,
"id": "61247da5", "id": "61247da5",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -131,21 +135,40 @@
"# 3. Harte Bremsmanöver (metrisch, Poisson)\n", "# 3. Harte Bremsmanöver (metrisch, Poisson)\n",
"hard_brakes = np.random.poisson(lam=2, size=N)\n", "hard_brakes = np.random.poisson(lam=2, size=N)\n",
"\n", "\n",
"# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien)\n", "# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien) Abhängigkeit von avg_speed\n",
"speeding = np.random.choice(['selten', 'manchmal', 'häufig'], size=N, p=[0.6, 0.3, 0.1])\n", "def softmax(x):\n",
" e_x = np.exp(x - np.max(x, axis=1, keepdims=True))\n",
" return e_x / e_x.sum(axis=1, keepdims=True)\n",
"\n", "\n",
"# 5. Straßentyp (nominal, Kontext)\n", "speed_scaled = (avg_speed - avg_speed.min()) / (avg_speed.max() - avg_speed.min())\n",
"road_type = np.random.choice(['Autobahn', 'Außerorts', 'Innerorts'], size=N, p=[0.32, 0.43, 0.25])\n", "strength = 0.01\n",
"logits = np.zeros((N, 3))\n",
"logits[:, 0] = 1 - strength * speed_scaled # selten\n",
"logits[:, 1] = 1 # manchmal\n",
"logits[:, 2] = 1 + strength * speed_scaled # häufig\n",
"probs = softmax(logits)\n",
"speeding = [np.random.choice(['selten', 'manchmal', 'häufig'], p=p) for p in probs]\n",
"\n", "\n",
"# 6. Wetterbedingungen (nominal, Kontext)\n", "# 5. Wetterbedingungen (nominal, Kontext)\n",
"weather = np.random.choice(['trocken', 'nass', 'winterlich'], size=N, p=[0.75, 0.2, 0.05])\n", "weather = np.random.choice(['trocken', 'nass', 'winterlich'], size=N, p=[0.75, 0.2, 0.05])\n",
"\n", "\n",
"# 7. Fahrstrecke (metrisch, lognormal)\n", "# 6. Fahrstrecke (metrisch, lognormal)\n",
"mu, sigma = np.log(12), 0.7\n", "mu, sigma = np.log(12), 0.7\n",
"trip_distance = np.random.lognormal(mean=mu, sigma=sigma, size=N)\n", "trip_distance = np.random.lognormal(mean=mu, sigma=sigma, size=N)\n",
"\n", "\n",
"# 7. Road type (nominal, context variable) with a weak dependence on trip_distance.\n",
"# Each distance band carries its own category probabilities, given in the order of\n",
"# road_labels. Every band is sampled with ONE vectorized np.random.choice call instead\n",
"# of one Python-level call per trip, which avoids a million-iteration interpreter loop.\n",
"# The per-band distribution is unchanged; only the seeded realization differs.\n",
"road_labels = ['Innerorts', 'Außerorts', 'Autobahn']\n",
"road_bands = [\n",
"    (trip_distance < 5, [0.5, 0.35, 0.15]),                          # short trips\n",
"    ((trip_distance >= 5) & (trip_distance < 20), [0.2, 0.5, 0.3]),  # medium trips\n",
"    (~(trip_distance < 20), [0.1, 0.4, 0.5]),                        # everything else (long trips)\n",
"]\n",
"# Object dtype keeps the labels as plain Python strings in the DataFrame column.\n",
"road_type = np.empty(trip_distance.shape[0], dtype=object)\n",
"for band_mask, band_probs in road_bands:\n",
"    road_type[band_mask] = np.random.choice(road_labels, size=int(band_mask.sum()), p=band_probs)\n",
"\n",
"# 8. Wochentag (nominal, Kontext)\n", "# 8. Wochentag (nominal, Kontext)\n",
"weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])\n" "weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])"
] ]
}, },
{ {
@@ -158,29 +181,183 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 52,
"id": "f8e0efb0", "id": "f8e0efb0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>avg_speed</th>\n",
" <th>shift_behavior</th>\n",
" <th>hard_brakes</th>\n",
" <th>speeding</th>\n",
" <th>weather</th>\n",
" <th>trip_distance</th>\n",
" <th>road_type</th>\n",
" <th>weekday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>64.494547</td>\n",
" <td>früh</td>\n",
" <td>2</td>\n",
" <td>manchmal</td>\n",
" <td>trocken</td>\n",
" <td>30.449962</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44.139270</td>\n",
" <td>früh</td>\n",
" <td>3</td>\n",
" <td>häufig</td>\n",
" <td>trocken</td>\n",
" <td>11.911103</td>\n",
" <td>Autobahn</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>42.154349</td>\n",
" <td>spät</td>\n",
" <td>2</td>\n",
" <td>selten</td>\n",
" <td>trocken</td>\n",
" <td>6.086055</td>\n",
" <td>Autobahn</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20.466814</td>\n",
" <td>normal</td>\n",
" <td>5</td>\n",
" <td>häufig</td>\n",
" <td>trocken</td>\n",
" <td>5.732684</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46.917154</td>\n",
" <td>spät</td>\n",
" <td>2</td>\n",
" <td>selten</td>\n",
" <td>trocken</td>\n",
" <td>7.256758</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" avg_speed shift_behavior hard_brakes speeding weather trip_distance \\\n",
"0 64.494547 früh 2 manchmal trocken 30.449962 \n",
"1 44.139270 früh 3 häufig trocken 11.911103 \n",
"2 42.154349 spät 2 selten trocken 6.086055 \n",
"3 20.466814 normal 5 häufig trocken 5.732684 \n",
"4 46.917154 spät 2 selten trocken 7.256758 \n",
"\n",
" road_type weekday \n",
"0 Außerorts Mo-Fr \n",
"1 Autobahn Mo-Fr \n",
"2 Autobahn Mo-Fr \n",
"3 Außerorts Mo-Fr \n",
"4 Außerorts Mo-Fr "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"df = pd.DataFrame({\n", "df = pd.DataFrame({\n",
" 'avg_speed': avg_speed,\n", " 'avg_speed': avg_speed,\n",
" 'shift_behavior': shift_behavior,\n", " 'shift_behavior': shift_behavior,\n",
" 'hard_brakes': hard_brakes,\n", " 'hard_brakes': hard_brakes,\n",
" 'speeding': speeding,\n", " 'speeding': speeding,\n",
" 'road_type': road_type,\n",
" 'weather': weather,\n", " 'weather': weather,\n",
" 'trip_distance': trip_distance,\n", " 'trip_distance': trip_distance,\n",
" 'road_type': road_type,\n",
" 'weekday': weekday\n", " 'weekday': weekday\n",
"})\n", "})\n",
"\n", "\n",
"df.head()\n" "df.head()\n"
] ]
},
{
"cell_type": "markdown",
"id": "f06e7c4d",
"metadata": {},
"source": [
"### VIF der Abhängigkeiten prüfen"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "6bfb199a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variance Inflation Factor (VIF) für ausgewählte Variabeln:\n",
" features VIF Factor\n",
"0 avg_speed 5.768491\n",
"1 trip_distance 2.469313\n",
"2 road_type_Außerorts 2.238422\n",
"3 speeding_manchmal 1.900947\n",
"4 speeding_selten 1.893981\n",
"5 road_type_Innerorts 1.587059\n"
]
}
],
"source": [
"# Variance inflation factors for the variables that were generated with dependencies.\n",
"# Categorical columns are one-hot encoded with the first level dropped as reference.\n",
"columns = ['avg_speed', 'speeding', 'trip_distance', 'road_type']\n",
"vif_df = pd.get_dummies(df[columns], drop_first=True).astype(float)\n",
"\n",
"# variance_inflation_factor does not add an intercept itself. Without one it works with\n",
"# the uncentered R^2, which inflates the VIF of every regressor with a non-zero mean\n",
"# (e.g. avg_speed). Add the constant explicitly and skip it when reporting.\n",
"vif_exog = sm.add_constant(vif_df, prepend=True, has_constant='add')\n",
"vif = pd.DataFrame({\n",
"    'features': vif_df.columns,\n",
"    # Column 0 of vif_exog is the constant, so regressor i of vif_df sits at index i + 1.\n",
"    'VIF Factor': [variance_inflation_factor(vif_exog.values, i) for i in range(1, vif_exog.shape[1])],\n",
"})\n",
"vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)\n",
"print(\"\\nVariance Inflation Factor (VIF) für ausgewählte Variabeln:\")\n",
"print(vif)"
]
} }
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "base", "display_name": "hft_ml",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@@ -194,7 +371,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.9" "version": "3.11.10"
} }
}, },
"nbformat": 4, "nbformat": 4,