From 4091f3a44f0c879b034ac4d95adbb2055692ff05 Mon Sep 17 00:00:00 2001 From: devthejo Date: Sun, 8 Mar 2026 14:50:47 +0100 Subject: [PATCH] fix(dae): csv address duplication + data normalization issues --- scripts/dae/geodae-to-csv.js | 143 ++++++++++++++++++++++++++++++++--- 1 file changed, 132 insertions(+), 11 deletions(-) diff --git a/scripts/dae/geodae-to-csv.js b/scripts/dae/geodae-to-csv.js index 054f66a..3e838d1 100644 --- a/scripts/dae/geodae-to-csv.js +++ b/scripts/dae/geodae-to-csv.js @@ -14,9 +14,9 @@ const OUTPUT = join(__dirname, "geodae.csv"); function escapeCsv(value) { if (value == null) return ""; - // Replace newlines with spaces to keep one row per entry + // Replace newlines and tabs with spaces to keep one row per entry const str = String(value) - .replace(/[\r\n]+/g, " ") + .replace(/[\r\n\t]+/g, " ") .trim(); if (str.includes('"') || str.includes(",")) { return '"' + str.replace(/"/g, '""') + '"'; @@ -140,7 +140,7 @@ function is24h(arr) { function buildHoraires(p) { const days = formatDays(p.c_disp_j); const hours = formatHours(p.c_disp_h); - const complt = (p.c_disp_complt || "").replace(/[\r\n]+/g, " ").trim(); + const complt = (p.c_disp_complt || "").replace(/[\r\n\t]+/g, " ").trim(); if (!complt) { // No complement: just days + hours @@ -174,15 +174,52 @@ function buildHoraires(p) { function formatAddress(p) { const parts = []; - const num = (p.c_adr_num || "").trim(); - const street = (p.c_adr_voie || "").trim(); + let num = (p.c_adr_num || "").trim(); + let street = (p.c_adr_voie || "") + .split("\t")[0] // strip tab-separated cp/city embedded in the field + .split("|")[0] // strip pipe-separated cp/city embedded in the field + .trim(); + + // Drop invalid numbers: placeholders, decimals, letters, etc. + // Valid street numbers: digits with optional dash/slash/space separators (e.g.
"62", "62-64", "10 12") + if (!/^\d[\d\s\-/]*$/.test(num)) num = ""; + + const cp = (p.c_com_cp || "").trim(); + + // Drop num when it equals the postal code (data entry mistake) + if (num && num === cp) num = ""; + // Strip parenthesized cp from city name, e.g. "GANAC (09000)" → "GANAC" + let city = (p.c_com_nom || "").trim(); + if (cp && city) { + city = city.replace(new RegExp("\\s*\\(" + cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\)"), "").trim(); + } + + // Strip cp+city already embedded in street field + // e.g. "Mont Salomon 38200 Vienne" or "62117 rue de Lambres" when cp matches + if (cp && street.includes(cp)) { + const cpEscaped = cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + // Trailing: "street 38200 Vienne" → "street" + street = street.replace(new RegExp("\\s+" + cpEscaped + "\\s+.*$"), "").trim(); + // Leading: "62117 rue de Lambres" → "rue de Lambres" + street = street.replace(new RegExp("^" + cpEscaped + "\\s+"), "").trim(); + } + if (num && street) { - parts.push(num + " " + street); + // Avoid duplicated number when street already starts with the same number + // Handles plain "62 Rue…", ranges "62-64 Rue…", and slashes "62/64 Rue…" + const numEscaped = num.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + const alreadyHasNum = new RegExp("^" + numEscaped + "(?!\\d)").test(street); + if (alreadyHasNum) { + parts.push(street); + } else { + parts.push(num + " " + street); + } } else if (street) { parts.push(street); + } else if (num) { + parts.push(num); } - const cp = (p.c_com_cp || "").trim(); - const city = (p.c_com_nom || "").trim(); + if (cp && city) { parts.push(cp + " " + city); } else if (city) { @@ -250,6 +287,83 @@ function passesFilter(p) { return true; } +/** + * Check if coordinates fall in a plausible French territory. 
+ */ +function isPlausibleFrance(lat, lon) { + if (Math.abs(lat) > 90 || Math.abs(lon) > 180) return false; + // Metropolitan France + if (lat >= 41 && lat <= 52 && lon >= -6 && lon <= 11) return true; + // La Réunion + if (lat >= -22 && lat <= -20 && lon >= 54 && lon <= 57) return true; + // Mayotte + if (lat >= -14 && lat <= -12 && lon >= 44 && lon <= 46) return true; + // Guadeloupe / Martinique / Saint-Martin / Saint-Barthélemy + if (lat >= 14 && lat <= 18 && lon >= -64 && lon <= -60) return true; + // Guyane + if (lat >= 2 && lat <= 6 && lon >= -55 && lon <= -51) return true; + // Nouvelle-Calédonie + if (lat >= -23 && lat <= -19 && lon >= 163 && lon <= 169) return true; + // Polynésie française + if (lat >= -28 && lat <= -7 && lon >= -155 && lon <= -130) return true; + // Saint-Pierre-et-Miquelon + if (lat >= 46 && lat <= 48 && lon >= -57 && lon <= -55) return true; + // Wallis-et-Futuna + if (lat >= -15 && lat <= -13 && lon >= -179 && lon <= -176) return true; + // TAAF (Kerguelen, Crozet, Amsterdam, etc.) + if (lat >= -50 && lat <= -37 && lon >= 50 && lon <= 78) return true; + // Clipperton + if (lat >= 10 && lat <= 11 && lon >= -110 && lon <= -108) return true; + return false; +} + +/** + * Try to fix an out-of-range coordinate by dividing by powers of 10. + * Returns val unchanged when |val| <= limit; otherwise val divided by 10 repeatedly until |v| <= limit (never null). + */ +function tryNormalizeCoord(val, limit) { + if (Math.abs(val) <= limit) return val; + let v = val; + while (Math.abs(v) > limit) { + v /= 10; + } + return v; +} + +/** + * Attempt to produce valid WGS84 coordinates from potentially garbled input. + * Strategy: + * 1. Use properties directly if valid + * 2. Fall back to GeoJSON geometry (standard [lon, lat] then swapped) + * 3. Try power-of-10 normalization for misplaced decimals + */ +function fixCoordinates(lat, lon, geometry) { + // 1.
Already valid WGS84 — trust the source as-is + if (Math.abs(lat) <= 90 && Math.abs(lon) <= 180) return { lat, lon }; + + // Out of WGS84 range — try to recover using fallbacks + plausibility check + + // 2. Try GeoJSON geometry + if (geometry && geometry.coordinates) { + let coords = geometry.coordinates; + // Flatten nested arrays (MultiPoint, etc.) + while (Array.isArray(coords[0])) coords = coords[0]; + if (coords.length === 2) { + const [gLon, gLat] = coords; // GeoJSON = [lon, lat] + if (isPlausibleFrance(gLat, gLon)) return { lat: gLat, lon: gLon }; + // Try swapped (some entries have lat/lon inverted in geometry) + if (isPlausibleFrance(gLon, gLat)) return { lat: gLon, lon: gLat }; + } + } + + // 3. Try power-of-10 normalization for misplaced decimals + const fixedLat = tryNormalizeCoord(lat, 90); + const fixedLon = tryNormalizeCoord(lon, 180); + if (isPlausibleFrance(fixedLat, fixedLon)) return { lat: fixedLat, lon: fixedLon }; + + return null; +} + // --- Main --- console.log("Reading geodae.json..."); @@ -281,13 +395,20 @@ for (const feature of features) { continue; } - const lat = p.c_lat_coor1; - const lon = p.c_long_coor1; - if (lat == null || lon == null) { + const rawLat = p.c_lat_coor1; + const rawLon = p.c_long_coor1; + if (rawLat == null || rawLon == null) { filtered++; continue; } + const fixed = fixCoordinates(rawLat, rawLon, feature.geometry); + if (!fixed) { + filtered++; + continue; + } + const { lat, lon } = fixed; + const always = isAlwaysAvailable(p); if (always) alwaysCount++;