fix(dae): csv address duplication + data normalization issues
This commit is contained in:
parent
609ddb47a9
commit
4091f3a44f
1 changed files with 132 additions and 11 deletions
|
|
@ -14,9 +14,9 @@ const OUTPUT = join(__dirname, "geodae.csv");
|
|||
|
||||
function escapeCsv(value) {
|
||||
if (value == null) return "";
|
||||
// Replace newlines with spaces to keep one row per entry
|
||||
// Replace newlines and tabs with spaces to keep one row per entry
|
||||
const str = String(value)
|
||||
.replace(/[\r\n]+/g, " ")
|
||||
.replace(/[\r\n\t]+/g, " ")
|
||||
.trim();
|
||||
if (str.includes('"') || str.includes(",")) {
|
||||
return '"' + str.replace(/"/g, '""') + '"';
|
||||
|
|
@ -140,7 +140,7 @@ function is24h(arr) {
|
|||
function buildHoraires(p) {
|
||||
const days = formatDays(p.c_disp_j);
|
||||
const hours = formatHours(p.c_disp_h);
|
||||
const complt = (p.c_disp_complt || "").replace(/[\r\n]+/g, " ").trim();
|
||||
const complt = (p.c_disp_complt || "").replace(/[\r\n\t]+/g, " ").trim();
|
||||
|
||||
if (!complt) {
|
||||
// No complement: just days + hours
|
||||
|
|
@ -174,15 +174,52 @@ function buildHoraires(p) {
|
|||
|
||||
function formatAddress(p) {
|
||||
const parts = [];
|
||||
const num = (p.c_adr_num || "").trim();
|
||||
const street = (p.c_adr_voie || "").trim();
|
||||
let num = (p.c_adr_num || "").trim();
|
||||
let street = (p.c_adr_voie || "")
|
||||
.split("\t")[0] // strip tab-separated cp/city embedded in the field
|
||||
.split("|")[0] // strip pipe-separated cp/city embedded in the field
|
||||
.trim();
|
||||
|
||||
// Drop invalid numbers: placeholders, decimals, letters, etc.
|
||||
// Valid street numbers: digits with optional dash/slash/space separators (e.g. "62", "62-64", "10 12")
|
||||
if (!/^\d[\d\s\-/]*$/.test(num)) num = "";
|
||||
|
||||
const cp = (p.c_com_cp || "").trim();
|
||||
|
||||
// Drop num when it equals the postal code (data entry mistake)
|
||||
if (num && num === cp) num = "";
|
||||
// Strip parenthesized cp from city name, e.g. "GANAC (09000)" → "GANAC"
|
||||
let city = (p.c_com_nom || "").trim();
|
||||
if (cp && city) {
|
||||
city = city.replace(new RegExp("\\s*\\(" + cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\)"), "").trim();
|
||||
}
|
||||
|
||||
// Strip cp+city already embedded in street field
|
||||
// e.g. "Mont Salomon 38200 Vienne" or "62117 rue de Lambres" when cp matches
|
||||
if (cp && street.includes(cp)) {
|
||||
const cpEscaped = cp.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
// Trailing: "street 38200 Vienne" → "street"
|
||||
street = street.replace(new RegExp("\\s+" + cpEscaped + "\\s+.*$"), "").trim();
|
||||
// Leading: "62117 rue de Lambres" → "rue de Lambres"
|
||||
street = street.replace(new RegExp("^" + cpEscaped + "\\s+"), "").trim();
|
||||
}
|
||||
|
||||
if (num && street) {
|
||||
// Avoid duplicated number when street already starts with the same number
|
||||
// Handles plain "62 Rue…", ranges "62-64 Rue…", and slashes "62/64 Rue…"
|
||||
const numEscaped = num.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
const alreadyHasNum = new RegExp("^" + numEscaped + "(?!\\d)").test(street);
|
||||
if (alreadyHasNum) {
|
||||
parts.push(street);
|
||||
} else {
|
||||
parts.push(num + " " + street);
|
||||
}
|
||||
} else if (street) {
|
||||
parts.push(street);
|
||||
} else if (num) {
|
||||
parts.push(num);
|
||||
}
|
||||
const cp = (p.c_com_cp || "").trim();
|
||||
const city = (p.c_com_nom || "").trim();
|
||||
|
||||
if (cp && city) {
|
||||
parts.push(cp + " " + city);
|
||||
} else if (city) {
|
||||
|
|
@ -250,6 +287,83 @@ function passesFilter(p) {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
 * Check whether a coordinate pair falls inside one of several coarse bounding
 * boxes drawn around French territories (metropolitan and overseas).
 *
 * The boxes are deliberately generous rectangles: this is a plausibility
 * filter, not a point-in-polygon test, so nearby foreign locations inside a
 * box are also accepted.
 *
 * @param {number} lat - Latitude in decimal degrees.
 * @param {number} lon - Longitude in decimal degrees.
 * @returns {boolean} true when (lat, lon) lies inside at least one box;
 *   false otherwise (including for NaN or out-of-WGS84-range values).
 */
function isPlausibleFrance(lat, lon) {
  if (Math.abs(lat) > 90 || Math.abs(lon) > 180) return false;

  // Each entry is [minLat, maxLat, minLon, maxLon], bounds inclusive.
  const boxes = [
    [41, 52, -6, 11], // Metropolitan France (incl. Corsica)
    [-22, -20, 54, 57], // La Réunion
    [-14, -12, 44, 46], // Mayotte
    // Guadeloupe / Martinique / Saint-Martin / Saint-Barthélemy.
    // Upper latitude is 18.2 (not 18): the French part of Saint-Martin
    // lies at roughly 18.05–18.13°N and would otherwise be rejected.
    [14, 18.2, -64, -60],
    [2, 6, -55, -51], // Guyane
    [-23, -19, 163, 169], // Nouvelle-Calédonie
    [-28, -7, -155, -130], // Polynésie française
    [46, 48, -57, -55], // Saint-Pierre-et-Miquelon
    [-15, -13, -179, -176], // Wallis-et-Futuna
    [-50, -37, 50, 78], // TAAF (Kerguelen, Crozet, Amsterdam, etc.)
    [10, 11, -110, -108], // Clipperton
  ];

  return boxes.some(
    ([minLat, maxLat, minLon, maxLon]) =>
      lat >= minLat && lat <= maxLat && lon >= minLon && lon <= maxLon,
  );
}
|
||||
|
||||
/**
 * Try to fix an out-of-range coordinate by repeatedly dividing it by 10
 * (i.e. assuming the decimal point was misplaced).
 *
 * @param {number} val - Raw coordinate value.
 * @param {number} limit - Maximum allowed absolute value (e.g. 90 for a
 *   latitude, 180 for a longitude).
 * @returns {number} `val` unchanged when |val| <= limit; otherwise `val`
 *   divided by the smallest power of 10 that brings it within the limit.
 *   Returns NaN when no such value exists (non-numeric or infinite `val`,
 *   or a negative / NaN `limit`).
 */
function tryNormalizeCoord(val, limit) {
  if (Math.abs(val) <= limit) return val;

  let v = Number(val);
  // Dividing ±Infinity by 10 never shrinks it, and nothing satisfies a
  // negative limit: bail out instead of looping forever.
  if (!Number.isFinite(v) || !(limit >= 0)) return NaN;

  while (Math.abs(v) > limit) {
    v /= 10;
  }
  return v;
}
|
||||
|
||||
/**
 * Attempt to produce valid WGS84 coordinates from potentially garbled input.
 *
 * Strategy:
 *   1. Use the given lat/lon directly when they are within WGS84 range.
 *   2. Otherwise fall back to the GeoJSON geometry (standard [lon, lat]
 *      order first, then swapped), accepting only plausible French locations.
 *   3. Otherwise try power-of-10 normalization for misplaced decimals,
 *      again accepting only plausible French locations.
 *
 * @param {number} lat - Raw latitude.
 * @param {number} lon - Raw longitude.
 * @param {{coordinates?: Array}|null|undefined} geometry - GeoJSON geometry
 *   of the feature, used as a fallback source.
 * @returns {{lat: number, lon: number}|null} Usable coordinates, or null when
 *   nothing plausible could be recovered.
 */
function fixCoordinates(lat, lon, geometry) {
  // 1. Already valid WGS84 — trust the source as-is
  if (Math.abs(lat) <= 90 && Math.abs(lon) <= 180) return { lat, lon };

  // Out of WGS84 range — try to recover using fallbacks + plausibility check

  // 2. Try GeoJSON geometry
  if (geometry && geometry.coordinates) {
    let coords = geometry.coordinates;
    // Flatten nested arrays (MultiPoint, etc.) down to the first position
    while (Array.isArray(coords[0])) coords = coords[0];
    // A GeoJSON position may carry an optional third element (altitude),
    // so accept any position with at least two elements.
    if (Array.isArray(coords) && coords.length >= 2) {
      const [gLon, gLat] = coords; // GeoJSON = [lon, lat]
      if (isPlausibleFrance(gLat, gLon)) return { lat: gLat, lon: gLon };
      // Try swapped (some entries have lat/lon inverted in geometry)
      if (isPlausibleFrance(gLon, gLat)) return { lat: gLon, lon: gLat };
    }
  }

  // 3. Try power-of-10 normalization for misplaced decimals.
  // Non-numeric or infinite values cannot be rescued by dividing by 10.
  if (!Number.isFinite(Number(lat)) || !Number.isFinite(Number(lon))) {
    return null;
  }
  const fixedLat = tryNormalizeCoord(lat, 90);
  const fixedLon = tryNormalizeCoord(lon, 180);
  if (isPlausibleFrance(fixedLat, fixedLon)) {
    return { lat: fixedLat, lon: fixedLon };
  }

  return null;
}
|
||||
|
||||
// --- Main ---
|
||||
|
||||
console.log("Reading geodae.json...");
|
||||
|
|
@ -281,13 +395,20 @@ for (const feature of features) {
|
|||
continue;
|
||||
}
|
||||
|
||||
const lat = p.c_lat_coor1;
|
||||
const lon = p.c_long_coor1;
|
||||
if (lat == null || lon == null) {
|
||||
const rawLat = p.c_lat_coor1;
|
||||
const rawLon = p.c_long_coor1;
|
||||
if (rawLat == null || rawLon == null) {
|
||||
filtered++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const fixed = fixCoordinates(rawLat, rawLon, feature.geometry);
|
||||
if (!fixed) {
|
||||
filtered++;
|
||||
continue;
|
||||
}
|
||||
const { lat, lon } = fixed;
|
||||
|
||||
const always = isAlwaysAvailable(p);
|
||||
if (always) alwaysCount++;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue