update abhaengigkeit

This commit is contained in:
YannAhlgrim
2025-06-10 16:36:47 +02:00
parent 470b07b3a3
commit 7bb2c6535d
@@ -45,7 +45,7 @@
" Anzahl starker Bremsvorgänge mit hoher Verzögerung (>0,3g). Modellierung z.B. als Poisson-verteilte Zufallsvariable mit geringem Mittelwert (wenige Ereignisse pro 100km). Harte Bremsmanöver korrelieren mit aggressiver Fahrweise und sind prädiktiv für Fahrerwechsel. \n",
" → Quelle: [Tesla Safety Score Definition](https://www.findmyelectric.com/blog/tesla-safety-score-beta-explained)\n",
"\n",
"4. **Geschwindigkeitsüberschreitungen (ordinal):** \n",
"4. **Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien):** \n",
" Häufigkeit bzw. Ausmaß, mit dem Tempolimits überschritten werden. Kategorisierung z.B. in: \n",
" - *selten*, \n",
" - *manchmal*, \n",
@@ -58,24 +58,26 @@
"\n",
"#### Nicht erklärende Variablen:\n",
"\n",
"5. **Straßentyp (nominal, 3 Kategorien):** \n",
" - *Autobahn* (~30–33%) \n",
" - *Außerorts* (~43%) \n",
" - *Innerorts* (~24–27%) \n",
" → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n",
" → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n",
"\n",
"6. **Wetterbedingungen (nominal, 3 Kategorien):** \n",
"5. **Wetterbedingungen (nominal, 3 Kategorien):** \n",
" - *trocken* (~70–75%) \n",
" - *nass* (~20–25%) \n",
" - *winterlich* (<5%) \n",
" → Modelliert als Multinomial. Kontextvariable. \n",
" Die Einteilung basiert auf Daten des Statistischen Bundesamtes: Etwa 70–75% aller Fahrten finden unter trockenen Bedingungen statt, 20–25% bei Nässe und unter 5% bei Schnee oder Glätte.\n",
"\n",
"7. **Fahrstrecke (metrisch):** \n",
"6. **Fahrstrecke (metrisch):** \n",
" Länge der Fahrt (z.B. lognormalverteilt um 12km). Kontextvariable, nicht direkt erklärend. \n",
" → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n",
"\n",
" \n",
"7. **Straßentyp (nominal, 3 Kategorien):** \n",
" - *Autobahn* (~30–33%) \n",
" - *Außerorts* (~43%) \n",
" - *Innerorts* (~24–27%) \n",
" → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n",
" → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n",
"\n",
"8. **Wochentag (nominal, 2–3 Kategorien):** \n",
" Z.B. *Werktag* vs *Wochenende* oder *Mo–Fr*, *Sa*, *So*. Modellierung als kategorische Variable. Kontextvariable. \n",
" → Quelle: [Mobilität in Deutschland 2017 Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n"
@@ -91,7 +93,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 50,
"id": "97ba29a9",
"metadata": {},
"outputs": [],
@@ -102,6 +104,8 @@
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from scipy.stats import norm, multinomial, poisson, lognorm, bernoulli\n",
"import statsmodels.api as sm\n",
"from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
"\n",
"np.random.seed(11)\n",
"N = 1_000_000"
@@ -117,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 51,
"id": "61247da5",
"metadata": {},
"outputs": [],
@@ -131,21 +135,40 @@
"# 3. Harte Bremsmanöver (metrisch, Poisson)\n",
"hard_brakes = np.random.poisson(lam=2, size=N)\n",
"\n",
"# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien)\n",
"speeding = np.random.choice(['selten', 'manchmal', 'häufig'], size=N, p=[0.6, 0.3, 0.1])\n",
"# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien) – Abhängigkeit von avg_speed\n",
"def softmax(x):\n",
" e_x = np.exp(x - np.max(x, axis=1, keepdims=True))\n",
" return e_x / e_x.sum(axis=1, keepdims=True)\n",
"\n",
"# 5. Straßentyp (nominal, Kontext)\n",
"road_type = np.random.choice(['Autobahn', 'Außerorts', 'Innerorts'], size=N, p=[0.32, 0.43, 0.25])\n",
"speed_scaled = (avg_speed - avg_speed.min()) / (avg_speed.max() - avg_speed.min())\n",
"strength = 0.01\n",
"logits = np.zeros((N, 3))\n",
"logits[:, 0] = 1 - strength * speed_scaled # selten\n",
"logits[:, 1] = 1 # manchmal\n",
"logits[:, 2] = 1 + strength * speed_scaled # häufig\n",
"probs = softmax(logits)\n",
"speeding = [np.random.choice(['selten', 'manchmal', 'häufig'], p=p) for p in probs]\n",
"\n",
"# 6. Wetterbedingungen (nominal, Kontext)\n",
"# 5. Wetterbedingungen (nominal, Kontext)\n",
"weather = np.random.choice(['trocken', 'nass', 'winterlich'], size=N, p=[0.75, 0.2, 0.05])\n",
"\n",
"# 7. Fahrstrecke (metrisch, lognormal)\n",
"# 6. Fahrstrecke (metrisch, lognormal)\n",
"mu, sigma = np.log(12), 0.7\n",
"trip_distance = np.random.lognormal(mean=mu, sigma=sigma, size=N)\n",
"\n",
"# 7. Straßentyp (nominal, Kontext) – schwächere Abhängigkeit von trip_distance\n",
"road_probs = []\n",
"for dist in trip_distance:\n",
" if dist < 5:\n",
" road_probs.append([0.5, 0.35, 0.15])\n",
" elif dist < 20:\n",
" road_probs.append([0.2, 0.5, 0.3])\n",
" else:\n",
" road_probs.append([0.1, 0.4, 0.5])\n",
"road_type = [np.random.choice(['Innerorts', 'Außerorts', 'Autobahn'], p=prob) for prob in road_probs]\n",
"\n",
"# 8. Wochentag (nominal, Kontext)\n",
"weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])\n"
"weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])"
]
},
{
@@ -158,29 +181,183 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 52,
"id": "f8e0efb0",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>avg_speed</th>\n",
" <th>shift_behavior</th>\n",
" <th>hard_brakes</th>\n",
" <th>speeding</th>\n",
" <th>weather</th>\n",
" <th>trip_distance</th>\n",
" <th>road_type</th>\n",
" <th>weekday</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>64.494547</td>\n",
" <td>früh</td>\n",
" <td>2</td>\n",
" <td>manchmal</td>\n",
" <td>trocken</td>\n",
" <td>30.449962</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>44.139270</td>\n",
" <td>früh</td>\n",
" <td>3</td>\n",
" <td>häufig</td>\n",
" <td>trocken</td>\n",
" <td>11.911103</td>\n",
" <td>Autobahn</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>42.154349</td>\n",
" <td>spät</td>\n",
" <td>2</td>\n",
" <td>selten</td>\n",
" <td>trocken</td>\n",
" <td>6.086055</td>\n",
" <td>Autobahn</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20.466814</td>\n",
" <td>normal</td>\n",
" <td>5</td>\n",
" <td>häufig</td>\n",
" <td>trocken</td>\n",
" <td>5.732684</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46.917154</td>\n",
" <td>spät</td>\n",
" <td>2</td>\n",
" <td>selten</td>\n",
" <td>trocken</td>\n",
" <td>7.256758</td>\n",
" <td>Außerorts</td>\n",
" <td>Mo-Fr</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" avg_speed shift_behavior hard_brakes speeding weather trip_distance \\\n",
"0 64.494547 früh 2 manchmal trocken 30.449962 \n",
"1 44.139270 früh 3 häufig trocken 11.911103 \n",
"2 42.154349 spät 2 selten trocken 6.086055 \n",
"3 20.466814 normal 5 häufig trocken 5.732684 \n",
"4 46.917154 spät 2 selten trocken 7.256758 \n",
"\n",
" road_type weekday \n",
"0 Außerorts Mo-Fr \n",
"1 Autobahn Mo-Fr \n",
"2 Autobahn Mo-Fr \n",
"3 Außerorts Mo-Fr \n",
"4 Außerorts Mo-Fr "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\n",
" 'avg_speed': avg_speed,\n",
" 'shift_behavior': shift_behavior,\n",
" 'hard_brakes': hard_brakes,\n",
" 'speeding': speeding,\n",
" 'road_type': road_type,\n",
" 'weather': weather,\n",
" 'trip_distance': trip_distance,\n",
" 'road_type': road_type,\n",
" 'weekday': weekday\n",
"})\n",
"\n",
"df.head()\n"
]
},
{
"cell_type": "markdown",
"id": "f06e7c4d",
"metadata": {},
"source": [
"### VIF der Abhängigkeiten prüfen"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "6bfb199a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variance Inflation Factor (VIF) für ausgewählte Variabeln:\n",
" features VIF Factor\n",
"0 avg_speed 5.768491\n",
"1 trip_distance 2.469313\n",
"2 road_type_Außerorts 2.238422\n",
"3 speeding_manchmal 1.900947\n",
"4 speeding_selten 1.893981\n",
"5 road_type_Innerorts 1.587059\n"
]
}
],
"source": [
"vif = pd.DataFrame()\n",
"columns = ['avg_speed', 'speeding', 'trip_distance', 'road_type']\n",
"vif_df = pd.get_dummies(df[columns], drop_first=True)\n",
"vif_df = vif_df.astype(float)\n",
"vif['features'] = vif_df.columns\n",
"vif['VIF Factor'] = [variance_inflation_factor(vif_df.values, i) for i in range(vif_df.shape[1])]\n",
"vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)\n",
"print(\"\\nVariance Inflation Factor (VIF) für ausgewählte Variabeln:\")\n",
"print(vif)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"display_name": "hft_ml",
"language": "python",
"name": "python3"
},
@@ -194,7 +371,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.11.10"
}
},
"nbformat": 4,