fix(dae): csv address duplication + data normalization issues

This commit is contained in:
devthejo 2026-03-08 14:50:47 +01:00
parent 609ddb47a9
commit 4091f3a44f
No known key found for this signature in database
GPG key ID: 00CCA7A92B1D5351

View file

@ -14,9 +14,9 @@ const OUTPUT = join(__dirname, "geodae.csv");
function escapeCsv(value) {
if (value == null) return "";
// Replace newlines with spaces to keep one row per entry
// Replace newlines and tabs with spaces to keep one row per entry
const str = String(value)
.replace(/[\r\n]+/g, " ")
.replace(/[\r\n\t]+/g, " ")
.trim();
if (str.includes('"') || str.includes(",")) {
return '"' + str.replace(/"/g, '""') + '"';
@ -140,7 +140,7 @@ function is24h(arr) {
function buildHoraires(p) {
const days = formatDays(p.c_disp_j);
const hours = formatHours(p.c_disp_h);
const complt = (p.c_disp_complt || "").replace(/[\r\n]+/g, " ").trim();
const complt = (p.c_disp_complt || "").replace(/[\r\n\t]+/g, " ").trim();
if (!complt) {
// No complement: just days + hours
@ -174,15 +174,52 @@ function buildHoraires(p) {
function formatAddress(p) {
const parts = [];
const num = (p.c_adr_num || "").trim();
const street = (p.c_adr_voie || "").trim();
let num = (p.c_adr_num || "").trim();
let street = (p.c_adr_voie || "")
.split("\t")[0] // strip tab-separated cp/city embedded in the field
.split("|")[0] // strip pipe-separated cp/city embedded in the field
.trim();
// Drop invalid numbers: placeholders, decimals, letters, etc.
// Valid street numbers: digits with optional dash/slash/space separators (e.g. "62", "62-64", "10 12")
if (!/^\d[\d\s\-/]*$/.test(num)) num = "";
const cp = (p.c_com_cp || "").trim();
// Drop num when it equals the postal code (data entry mistake)
if (num && num === cp) num = "";
// Strip parenthesized cp from city name, e.g. "GANAC (09000)" → "GANAC"
let city = (p.c_com_nom || "").trim();
if (cp && city) {
city = city.replace(new RegExp("\\s*\\(" + cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\)"), "").trim();
}
// Strip cp+city already embedded in street field
// e.g. "Mont Salomon 38200 Vienne" or "62117 rue de Lambres" when cp matches
if (cp && street.includes(cp)) {
const cpEscaped = cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
// Trailing: "street 38200 Vienne" → "street"
street = street.replace(new RegExp("\\s+" + cpEscaped + "\\s+.*$"), "").trim();
// Leading: "62117 rue de Lambres" → "rue de Lambres"
street = street.replace(new RegExp("^" + cpEscaped + "\\s+"), "").trim();
}
if (num && street) {
parts.push(num + " " + street);
// Avoid duplicated number when street already starts with the same number
// Handles plain "62 Rue…", ranges "62-64 Rue…", and slashes "62/64 Rue…"
const numEscaped = num.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
const alreadyHasNum = new RegExp("^" + numEscaped + "(?!\\d)").test(street);
if (alreadyHasNum) {
parts.push(street);
} else {
parts.push(num + " " + street);
}
} else if (street) {
parts.push(street);
} else if (num) {
parts.push(num);
}
const cp = (p.c_com_cp || "").trim();
const city = (p.c_com_nom || "").trim();
if (cp && city) {
parts.push(cp + " " + city);
} else if (city) {
@ -250,6 +287,83 @@ function passesFilter(p) {
return true;
}
/**
 * Check if coordinates fall in a plausible French territory.
 *
 * Each territory is approximated by a coarse latitude/longitude bounding box,
 * so this is a plausibility filter rather than an exact border test: points
 * slightly outside French soil but inside a box are accepted.
 *
 * @param {number} lat - Latitude in decimal degrees.
 * @param {number} lon - Longitude in decimal degrees.
 * @returns {boolean} true when the point lies inside one of the boxes;
 *   false otherwise (including when either value is NaN, since every
 *   comparison below is then false).
 */
function isPlausibleFrance(lat, lon) {
  // Reject anything outside the WGS84 domain before testing the boxes.
  if (Math.abs(lat) > 90 || Math.abs(lon) > 180) return false;
  // Metropolitan France
  if (lat >= 41 && lat <= 52 && lon >= -6 && lon <= 11) return true;
  // La Réunion
  if (lat >= -22 && lat <= -20 && lon >= 54 && lon <= 57) return true;
  // Mayotte
  if (lat >= -14 && lat <= -12 && lon >= 44 && lon <= 46) return true;
  // Guadeloupe / Martinique / Saint-Martin / Saint-Barthélemy
  // Upper latitude bound is 18.2 rather than 18: Saint-Martin sits just
  // north of 18°N and would otherwise be rejected.
  if (lat >= 14 && lat <= 18.2 && lon >= -64 && lon <= -60) return true;
  // Guyane
  if (lat >= 2 && lat <= 6 && lon >= -55 && lon <= -51) return true;
  // Nouvelle-Calédonie
  if (lat >= -23 && lat <= -19 && lon >= 163 && lon <= 169) return true;
  // Polynésie française
  if (lat >= -28 && lat <= -7 && lon >= -155 && lon <= -130) return true;
  // Saint-Pierre-et-Miquelon
  if (lat >= 46 && lat <= 48 && lon >= -57 && lon <= -55) return true;
  // Wallis-et-Futuna
  if (lat >= -15 && lat <= -13 && lon >= -179 && lon <= -176) return true;
  // TAAF (Kerguelen, Crozet, Amsterdam, etc.)
  if (lat >= -50 && lat <= -37 && lon >= 50 && lon <= 78) return true;
  // Clipperton
  if (lat >= 10 && lat <= 11 && lon >= -110 && lon <= -108) return true;
  return false;
}
/**
 * Try to fix an out-of-range coordinate by dividing by powers of 10.
 *
 * A value whose magnitude is already within `limit` is returned untouched.
 * Otherwise it is divided by 10 repeatedly until its magnitude is no longer
 * greater than `limit`. No plausibility check is done here; the caller decides
 * whether the result makes sense.
 *
 * @param {number} val - Raw coordinate value.
 * @param {number} limit - Largest accepted magnitude (e.g. 90 for latitude,
 *   180 for longitude).
 * @returns {number} The value scaled into [-limit, limit]. Values that cannot
 *   be scaled (NaN, ±Infinity, non-numeric input) are returned unchanged.
 */
function tryNormalizeCoord(val, limit) {
  if (Math.abs(val) <= limit) return val;
  let scaled = Number(val);
  // Infinity / 10 is still Infinity, so the loop below would never end;
  // NaN never satisfies any comparison. Hand such values back as-is.
  if (!Number.isFinite(scaled)) return val;
  while (Math.abs(scaled) > limit) {
    scaled /= 10;
  }
  return scaled;
}
/**
 * Attempt to produce valid WGS84 coordinates from potentially garbled input.
 * Strategy:
 *   1. Use properties directly if valid
 *   2. Fall back to GeoJSON geometry (standard [lon, lat] then swapped)
 *   3. Try power-of-10 normalization for misplaced decimals
 *
 * @param {number} lat - Latitude taken from the feature properties.
 * @param {number} lon - Longitude taken from the feature properties.
 * @param {{coordinates?: Array}|null|undefined} geometry - GeoJSON geometry of
 *   the feature, used only when the property values are out of range.
 * @returns {{lat: number, lon: number}|null} Usable coordinates, or null when
 *   none of the strategies yields a plausible position.
 */
function fixCoordinates(lat, lon, geometry) {
  // 1. Already valid WGS84 — trust the source as-is
  if (Math.abs(lat) <= 90 && Math.abs(lon) <= 180) return { lat, lon };

  // Out of WGS84 range — try to recover using fallbacks + plausibility check

  // 2. Try GeoJSON geometry
  if (geometry && geometry.coordinates) {
    let coords = geometry.coordinates;
    // Flatten nested arrays (MultiPoint, etc.)
    while (Array.isArray(coords[0])) coords = coords[0];
    // A GeoJSON position may carry a third element (elevation); only the
    // first two are needed, so accept any position with at least two.
    if (Array.isArray(coords) && coords.length >= 2) {
      const [gLon, gLat] = coords; // GeoJSON = [lon, lat]
      if (isPlausibleFrance(gLat, gLon)) return { lat: gLat, lon: gLon };
      // Try swapped (some entries have lat/lon inverted in geometry)
      if (isPlausibleFrance(gLon, gLat)) return { lat: gLon, lon: gLat };
    }
  }

  // NaN / ±Infinity cannot be repaired by moving the decimal point, and
  // dividing Infinity by 10 never brings it into range — give up here.
  if (!Number.isFinite(Number(lat)) || !Number.isFinite(Number(lon))) {
    return null;
  }

  // 3. Try power-of-10 normalization for misplaced decimals
  const fixedLat = tryNormalizeCoord(lat, 90);
  const fixedLon = tryNormalizeCoord(lon, 180);
  if (isPlausibleFrance(fixedLat, fixedLon)) return { lat: fixedLat, lon: fixedLon };

  return null;
}
// --- Main ---
console.log("Reading geodae.json...");
@ -281,13 +395,20 @@ for (const feature of features) {
continue;
}
const lat = p.c_lat_coor1;
const lon = p.c_long_coor1;
if (lat == null || lon == null) {
const rawLat = p.c_lat_coor1;
const rawLon = p.c_long_coor1;
if (rawLat == null || rawLon == null) {
filtered++;
continue;
}
const fixed = fixCoordinates(rawLat, rawLon, feature.geometry);
if (!fixed) {
filtered++;
continue;
}
const { lat, lon } = fixed;
const always = isAlwaysAvailable(p);
if (always) alwaysCount++;