diff --git a/1_data_science/simulationsstudie/yann_ahlgrim_simulationsstudie.ipynb b/1_data_science/simulationsstudie/yann_ahlgrim_simulationsstudie.ipynb index e8f9d19..1d1429c 100644 --- a/1_data_science/simulationsstudie/yann_ahlgrim_simulationsstudie.ipynb +++ b/1_data_science/simulationsstudie/yann_ahlgrim_simulationsstudie.ipynb @@ -45,7 +45,7 @@ " Anzahl starker Bremsvorgänge mit hoher Verzögerung (>0,3 g). Modellierung z.B. als Poisson-verteilte Zufallsvariable mit geringem Mittelwert (wenige Ereignisse pro 100 km). Harte Bremsmanöver korrelieren mit aggressiver Fahrweise und sind prädiktiv für Fahrerwechsel. \n", " → Quelle: [Tesla Safety Score Definition](https://www.findmyelectric.com/blog/tesla-safety-score-beta-explained)\n", "\n", - "4. **Geschwindigkeitsüberschreitungen (ordinal):** \n", + "4. **Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien):** \n", " Häufigkeit bzw. Ausmaß, mit dem Tempolimits überschritten werden. Kategorisierung z.B. in: \n", " - *selten*, \n", " - *manchmal*, \n", @@ -58,24 +58,26 @@ "\n", "#### Nicht erklärende Variablen:\n", "\n", - "5. **Straßentyp (nominal, 3 Kategorien):** \n", - " - *Autobahn* (~30–33 %) \n", - " - *Außerorts* (~43 %) \n", - " - *Innerorts* (~24–27 %) \n", - " → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n", - " → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n", "\n", - "6. **Wetterbedingungen (nominal, 3 Kategorien):** \n", + "5. **Wetterbedingungen (nominal, 3 Kategorien):** \n", " - *trocken* (~70–75 %) \n", " - *nass* (~20–25 %) \n", " - *winterlich* (<5 %) \n", " → Modelliert als Multinomial. Kontextvariable. \n", " Die Einteilung basiert auf Daten des Statistischen Bundesamtes: Etwa 70–75 % aller Fahrten finden unter trockenen Bedingungen statt, 20–25 % bei Nässe und unter 5 % bei Schnee oder Glätte.\n", "\n", - "7. **Fahrstrecke (metrisch):** \n", + "6. **Fahrstrecke (metrisch):** \n", " Länge der Fahrt (z.B. lognormalverteilt um 12 km). Kontextvariable, nicht direkt erklärend. \n", " → Quelle: [Mobilität in Deutschland 2017 – Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n", "\n", + " \n", + "7. **Straßentyp (nominal, 3 Kategorien):** \n", + " - *Autobahn* (~30–33 %) \n", + " - *Außerorts* (~43 %) \n", + " - *Innerorts* (~24–27 %) \n", + " → Modelliert als Multinomialverteilung. Kontextvariable ohne direkte Prädiktionswirkung. \n", + " → Quelle: [Klimaschutzinstrumente im Verkehr](https://openumwelt.de/server/api/core/bitstreams/07bd23f9-ff0a-41e5-b89b-8d7baf0a5c36/content)\n", + "\n", "8. **Wochentag (nominal, 2–3 Kategorien):** \n", " Z.B. *Werktag* vs *Wochenende* oder *Mo–Fr*, *Sa*, *So*. Modellierung als kategorische Variable. Kontextvariable. \n", " → Quelle: [Mobilität in Deutschland 2017 – Ergebnisbericht](https://www.bmv.de/SharedDocs/DE/Anlage/G/mid-ergebnisbericht.pdf)\n" @@ -91,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "97ba29a9", "metadata": {}, "outputs": [], @@ -102,6 +104,8 @@ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy.stats import norm, multinomial, poisson, lognorm, bernoulli\n", + "import statsmodels.api as sm\n", + "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", "\n", "np.random.seed(11)\n", "N = 1_000_000" @@ -117,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "id": "61247da5", "metadata": {}, "outputs": [], @@ -131,21 +135,40 @@ "# 3. Harte Bremsmanöver (metrisch, Poisson)\n", "hard_brakes = np.random.poisson(lam=2, size=N)\n", "\n", - "# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien)\n", - "speeding = np.random.choice(['selten', 'manchmal', 'häufig'], size=N, p=[0.6, 0.3, 0.1])\n", + "# 4. Geschwindigkeitsüberschreitungen (ordinal, 3 Kategorien) – Abhängigkeit von avg_speed\n", + "def softmax(x):\n", + " e_x = np.exp(x - np.max(x, axis=1, keepdims=True))\n", + " return e_x / e_x.sum(axis=1, keepdims=True)\n", "\n", - "# 5. Straßentyp (nominal, Kontext)\n", - "road_type = np.random.choice(['Autobahn', 'Außerorts', 'Innerorts'], size=N, p=[0.32, 0.43, 0.25])\n", + "speed_scaled = (avg_speed - avg_speed.min()) / (avg_speed.max() - avg_speed.min())\n", + "strength = 0.01\n", + "logits = np.zeros((N, 3))\n", + "logits[:, 0] = 1 - strength * speed_scaled # selten\n", + "logits[:, 1] = 1 # manchmal\n", + "logits[:, 2] = 1 + strength * speed_scaled # häufig\n", + "probs = softmax(logits)\n", + "speeding = [np.random.choice(['selten', 'manchmal', 'häufig'], p=p) for p in probs]\n", "\n", - "# 6. Wetterbedingungen (nominal, Kontext)\n", + "# 5. Wetterbedingungen (nominal, Kontext)\n", "weather = np.random.choice(['trocken', 'nass', 'winterlich'], size=N, p=[0.75, 0.2, 0.05])\n", "\n", - "# 7. Fahrstrecke (metrisch, lognormal)\n", + "# 6. Fahrstrecke (metrisch, lognormal)\n", "mu, sigma = np.log(12), 0.7\n", "trip_distance = np.random.lognormal(mean=mu, sigma=sigma, size=N)\n", "\n", + "# 7. Straßentyp (nominal, Kontext) – schwächere Abhängigkeit von trip_distance\n", + "road_probs = []\n", + "for dist in trip_distance:\n", + " if dist < 5:\n", + " road_probs.append([0.5, 0.35, 0.15])\n", + " elif dist < 20:\n", + " road_probs.append([0.2, 0.5, 0.3])\n", + " else:\n", + " road_probs.append([0.1, 0.4, 0.5])\n", + "road_type = [np.random.choice(['Innerorts', 'Außerorts', 'Autobahn'], p=prob) for prob in road_probs]\n", + "\n", "# 8. Wochentag (nominal, Kontext)\n", - "weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])\n" + "weekday = np.random.choice(['Mo-Fr', 'Sa', 'So'], size=N, p=[0.7, 0.15, 0.15])" ] }, { @@ -158,29 +181,183 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "f8e0efb0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
avg_speedshift_behaviorhard_brakesspeedingweathertrip_distanceroad_typeweekday
064.494547früh2manchmaltrocken30.449962AußerortsMo-Fr
144.139270früh3häufigtrocken11.911103AutobahnMo-Fr
242.154349spät2seltentrocken6.086055AutobahnMo-Fr
320.466814normal5häufigtrocken5.732684AußerortsMo-Fr
446.917154spät2seltentrocken7.256758AußerortsMo-Fr
\n", + "
" + ], + "text/plain": [ + " avg_speed shift_behavior hard_brakes speeding weather trip_distance \\\n", + "0 64.494547 früh 2 manchmal trocken 30.449962 \n", + "1 44.139270 früh 3 häufig trocken 11.911103 \n", + "2 42.154349 spät 2 selten trocken 6.086055 \n", + "3 20.466814 normal 5 häufig trocken 5.732684 \n", + "4 46.917154 spät 2 selten trocken 7.256758 \n", + "\n", + " road_type weekday \n", + "0 Außerorts Mo-Fr \n", + "1 Autobahn Mo-Fr \n", + "2 Autobahn Mo-Fr \n", + "3 Außerorts Mo-Fr \n", + "4 Außerorts Mo-Fr " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = pd.DataFrame({\n", " 'avg_speed': avg_speed,\n", " 'shift_behavior': shift_behavior,\n", " 'hard_brakes': hard_brakes,\n", " 'speeding': speeding,\n", - " 'road_type': road_type,\n", " 'weather': weather,\n", " 'trip_distance': trip_distance,\n", + " 'road_type': road_type,\n", " 'weekday': weekday\n", "})\n", "\n", "df.head()\n" ] + }, + { + "cell_type": "markdown", + "id": "f06e7c4d", + "metadata": {}, + "source": [ + "### VIF der Abhängigkeiten prüfen" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "6bfb199a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Variance Inflation Factor (VIF) für ausgewählte Variabeln:\n", + " features VIF Factor\n", + "0 avg_speed 5.768491\n", + "1 trip_distance 2.469313\n", + "2 road_type_Außerorts 2.238422\n", + "3 speeding_manchmal 1.900947\n", + "4 speeding_selten 1.893981\n", + "5 road_type_Innerorts 1.587059\n" + ] + } + ], + "source": [ + "vif = pd.DataFrame()\n", + "columns = ['avg_speed', 'speeding', 'trip_distance', 'road_type']\n", + "vif_df = pd.get_dummies(df[columns], drop_first=True)\n", + "vif_df = vif_df.astype(float)\n", + "vif['features'] = vif_df.columns\n", + "vif['VIF Factor'] = [variance_inflation_factor(vif_df.values, i) for i in range(vif_df.shape[1])]\n", + "vif = vif.sort_values('VIF Factor', ascending=False).reset_index(drop=True)\n", + "print(\"\\nVariance Inflation Factor (VIF) für ausgewählte Variabeln:\")\n", + "print(vif)" + ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "hft_ml", "language": "python", "name": "python3" }, @@ -194,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.10" } }, "nbformat": 4,