Ingestão dump_wikidata no PostgreSQL - datasets-br/city-codes GitHub Wiki
Exemplos:
-- lista arquivos JSON das cidades do Estado de São Paulo:
select pg_ls_dir('/opt/gits/city-codes-BR/data/dump_wikidata/SP');
-- recupera uma claim específica:
SELECT (x->'claims'->'P402'->0->>'value')::bigint AS osm_id
FROM (
select pg_read_file('/opt/gits/city-codes-BR/data/dump_wikidata/SP/Itabera.json',true)::jsonb
) t(x);
Função de varredura
CREATE or replace FUNCTION ingest.wikidatafolder_input_files(
p_fpath text DEFAULT '/opt/gits/city-codes-BR/data/dump_wikidata'
) RETURNS TABLE (
cityname text, fname text, fmeta jsonb, jcontent jsonb
) AS $f$
WITH t0 AS ( SELECT rtrim(p_fpath,'/') AS fpath )
, t1 AS (
SELECT f as statename,
t0.fpath ||'/'|| f as f
FROM pg_ls_dir((SELECT fpath FROM t0)) t(f), t0
), t3 AS (
SELECT t1.statename, t1.f as fpath, t2.f2 as fname
FROM t1, LATERAL pg_ls_dir(t1.f) t2(f2)
WHERE f2 ~ '\.json$'
)
SELECT statename, fname,
to_jsonb( pg_stat_file(f) ) || jsonb_build_object('fpath',fpath) fmeta,
pg_read_file(f,true)::jsonb as jcontent
FROM ( SELECT *, fpath ||'/'|| fname as f FROM t3 ) t4
$f$ language SQL immutable;
CREATE TABLE cache_wikidata_city AS
SELECT cityname, fname, jcontent->>'id' AS wikidata_id,
(jcontent->'claims'->'P402'->0->>'value')::bigint AS osm_id,
(jcontent->'claims'->'P1585'->0->>'value')::bigint AS local_id
FROM ingest.wikidatafolder_input_files()
;
-- confirmar no OSM com relation https://www.openstreetmap.org/relation/{osm_id}
-- e conferindo tags 'IBGE:GEOCODIGO' (mesmo que local_id), 'admin_level' (8 para municípios),
-- 'boundary' (administrative), 'type' (boundary) e wikidata (mesmo código com "Q").
-- CORRECAO DE BRASILIA (nao usar o código da UF (53) mas do município
update cache_wikidata_city set local_id=5300108 where wikidata_id='Q2844' and osm_id='2758138';
Regatando do modelo antigo
Na antiga base dl03t_main
:
COPY (
SELECT
COALESCE(w.osm_id::bigint,-76*10000000 -ibge_id::bigint) AS osm_id,
76 AS jurisd_base_id,
ibge_id AS jurisd_local_id,
c.name,
c.state AS parent_abbrev,
c.abbrev3 AS abbrev,
c.wikidata_id, c.lexlabel, c.isolabel_ext, c.ddd, c.info
FROM ingest.city c
LEFT JOIN cache_wikidata_city w ON 'Q'||c.wikidata_id=w.wikidata_id AND w.local_id::int=ibge_id
ORDER BY 2,3
) TO '/tmp/jurisdiction.csv' CSV HEADER
;