Files
voyage/backend/server/worldtravel/management/commands/download-countries.py

378 lines
19 KiB
Python

import os
from django.core.management.base import BaseCommand
import requests
from worldtravel.models import Country, Region, City
from django.db import transaction
from tqdm import tqdm
import ijson
import gc
from django.conf import settings
COUNTRY_REGION_JSON_VERSION = settings.COUNTRY_REGION_JSON_VERSION
media_root = settings.MEDIA_ROOT
def saveCountryFlag(country_code):
# For standards, use the lowercase country_code
country_code = country_code.lower()
flags_dir = os.path.join(media_root, 'flags')
# Check if the flags directory exists, if not, create it
if not os.path.exists(flags_dir):
os.makedirs(flags_dir)
# Check if the flag already exists in the media folder
flag_path = os.path.join(flags_dir, f'{country_code}.png')
if os.path.exists(flag_path):
print(f'Flag for {country_code} already exists')
return
res = requests.get(f'https://flagcdn.com/h240/{country_code}.png'.lower())
if res.status_code == 200:
with open(flag_path, 'wb') as f:
f.write(res.content)
print(f'Flag for {country_code} downloaded')
else:
print(f'Error downloading flag for {country_code}')
class Command(BaseCommand):
help = 'Imports the world travel data'
def add_arguments(self, parser):
parser.add_argument('--force', action='store_true', help='Force download the countries+regions+states.json file')
def handle(self, **options):
force = options['force']
batch_size = 500 # Increased batch size for better performance
countries_json_path = os.path.join(settings.MEDIA_ROOT, f'countries+regions+states-{COUNTRY_REGION_JSON_VERSION}.json')
if not os.path.exists(countries_json_path) or force:
res = requests.get(f'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{COUNTRY_REGION_JSON_VERSION}/json/countries%2Bstates%2Bcities.json')
if res.status_code == 200:
with open(countries_json_path, 'w') as f:
f.write(res.text)
self.stdout.write(self.style.SUCCESS('countries+regions+states.json downloaded successfully'))
else:
self.stdout.write(self.style.ERROR('Error downloading countries+regions+states.json'))
return
elif not os.path.isfile(countries_json_path):
self.stdout.write(self.style.ERROR('countries+regions+states.json is not a file'))
return
elif os.path.getsize(countries_json_path) == 0:
self.stdout.write(self.style.ERROR('countries+regions+states.json is empty'))
return
elif Country.objects.count() == 0 or Region.objects.count() == 0 or City.objects.count() == 0:
self.stdout.write(self.style.WARNING('Some region data is missing. Re-importing all data.'))
else:
self.stdout.write(self.style.SUCCESS('Latest country, region, and state data already downloaded.'))
return
# Use sets for faster lookup instead of dictionaries when we only need existence checks
self.stdout.write(self.style.SUCCESS('Loading existing data for comparison...'))
existing_country_codes = set(Country.objects.values_list('country_code', flat=True))
existing_region_ids = set(Region.objects.values_list('id', flat=True))
existing_city_ids = set(City.objects.values_list('id', flat=True))
self.stdout.write(self.style.SUCCESS(f'Found {len(existing_country_codes)} existing countries, {len(existing_region_ids)} regions, {len(existing_city_ids)} cities'))
# Only fetch full objects when we actually need to update them
existing_countries = {}
existing_regions = {}
existing_cities = {}
processed_country_codes = set()
processed_region_ids = set()
processed_city_ids = set()
# Process data in streaming fashion to avoid loading everything into memory
self.stdout.write(self.style.SUCCESS('Starting to process country data...'))
with open(countries_json_path, 'rb') as f:
parser = ijson.items(f, 'item')
countries_to_create = []
regions_to_create = []
cities_to_create = []
countries_to_update = []
regions_to_update = []
cities_to_update = []
country_count = 0
total_regions_processed = 0
total_cities_processed = 0
batch_number = 1
for country in parser:
country_count += 1
country_code = country['iso2']
country_name = country['name']
country_subregion = country['subregion']
country_capital = country['capital']
longitude = round(float(country['longitude']), 6) if country['longitude'] else None
latitude = round(float(country['latitude']), 6) if country['latitude'] else None
if country_count % 10 == 0:
self.stdout.write(f'Processing country {country_count}: {country_name} ({country_code})')
processed_country_codes.add(country_code)
if country_code in existing_country_codes:
# Only fetch when needed for updates
if country_code not in existing_countries:
existing_countries[country_code] = Country.objects.get(country_code=country_code)
country_obj = existing_countries[country_code]
country_obj.name = country_name
country_obj.subregion = country_subregion
country_obj.capital = country_capital
country_obj.longitude = longitude
country_obj.latitude = latitude
countries_to_update.append(country_obj)
else:
country_obj = Country(
name=country_name,
country_code=country_code,
subregion=country_subregion,
capital=country_capital,
longitude=longitude,
latitude=latitude
)
countries_to_create.append(country_obj)
saveCountryFlag(country_code)
# Process states/regions
region_count_for_country = 0
city_count_for_country = 0
if country['states']:
for state in country['states']:
name = state['name']
state_id = f"{country_code}-{state['state_code']}"
latitude = round(float(state['latitude']), 6) if state['latitude'] else None
longitude = round(float(state['longitude']), 6) if state['longitude'] else None
if state_id in processed_region_ids:
continue
processed_region_ids.add(state_id)
region_count_for_country += 1
total_regions_processed += 1
if state_id in existing_region_ids:
if state_id not in existing_regions:
existing_regions[state_id] = Region.objects.get(id=state_id)
region_obj = existing_regions[state_id]
region_obj.name = name
region_obj.country = country_obj
region_obj.longitude = longitude
region_obj.latitude = latitude
regions_to_update.append(region_obj)
else:
region_obj = Region(
id=state_id,
name=name,
country=country_obj,
longitude=longitude,
latitude=latitude
)
regions_to_create.append(region_obj)
# Process cities
if 'cities' in state and len(state['cities']) > 0:
for city in state['cities']:
city_id = f"{state_id}-{city['id']}"
city_name = city['name']
latitude = round(float(city['latitude']), 6) if city['latitude'] else None
longitude = round(float(city['longitude']), 6) if city['longitude'] else None
if city_id in processed_city_ids:
continue
processed_city_ids.add(city_id)
city_count_for_country += 1
total_cities_processed += 1
if city_id in existing_city_ids:
if city_id not in existing_cities:
existing_cities[city_id] = City.objects.get(id=city_id)
city_obj = existing_cities[city_id]
city_obj.name = city_name
city_obj.region = region_obj
city_obj.longitude = longitude
city_obj.latitude = latitude
cities_to_update.append(city_obj)
else:
city_obj = City(
id=city_id,
name=city_name,
region=region_obj,
longitude=longitude,
latitude=latitude
)
cities_to_create.append(city_obj)
else:
# Country without states - create a default region
state_id = f"{country_code}-00"
processed_region_ids.add(state_id)
region_count_for_country = 1
total_regions_processed += 1
if state_id in existing_region_ids:
if state_id not in existing_regions:
existing_regions[state_id] = Region.objects.get(id=state_id)
region_obj = existing_regions[state_id]
region_obj.name = country_name
region_obj.country = country_obj
regions_to_update.append(region_obj)
else:
region_obj = Region(
id=state_id,
name=country_name,
country=country_obj
)
regions_to_create.append(region_obj)
if country_count % 10 == 0:
self.stdout.write(f' └─ {country_name}: {region_count_for_country} regions, {city_count_for_country} cities')
# Process in batches during iteration to manage memory
if country_count % 50 == 0: # Process every 50 countries
self.stdout.write(self.style.WARNING(f'Processing batch {batch_number} (countries {country_count-49}-{country_count})...'))
self.stdout.write(f' Countries to create: {len(countries_to_create)}, to update: {len(countries_to_update)}')
self.stdout.write(f' Regions to create: {len(regions_to_create)}, to update: {len(regions_to_update)}')
self.stdout.write(f' Cities to create: {len(cities_to_create)}, to update: {len(cities_to_update)}')
self._process_batches(
countries_to_create, regions_to_create, cities_to_create,
countries_to_update, regions_to_update, cities_to_update,
batch_size
)
self.stdout.write(self.style.SUCCESS(f'✓ Batch {batch_number} completed successfully'))
# Clear processed batches and force garbage collection
countries_to_create.clear()
regions_to_create.clear()
cities_to_create.clear()
countries_to_update.clear()
regions_to_update.clear()
cities_to_update.clear()
# Clear the cached objects to free memory
existing_countries.clear()
existing_regions.clear()
existing_cities.clear()
gc.collect()
batch_number += 1
# Process remaining batches
if countries_to_create or regions_to_create or cities_to_create or \
countries_to_update or regions_to_update or cities_to_update:
self.stdout.write(self.style.WARNING(f'Processing final batch {batch_number} (remaining {len(countries_to_create + countries_to_update)} countries)...'))
self.stdout.write(f' Countries to create: {len(countries_to_create)}, to update: {len(countries_to_update)}')
self.stdout.write(f' Regions to create: {len(regions_to_create)}, to update: {len(regions_to_update)}')
self.stdout.write(f' Cities to create: {len(cities_to_create)}, to update: {len(cities_to_update)}')
self._process_batches(
countries_to_create, regions_to_create, cities_to_create,
countries_to_update, regions_to_update, cities_to_update,
batch_size
)
self.stdout.write(self.style.SUCCESS(f'✓ Final batch completed successfully'))
self.stdout.write(self.style.SUCCESS(f'Finished processing {country_count} countries, {total_regions_processed} regions, {total_cities_processed} cities'))
# Clean up obsolete records
self.stdout.write(self.style.WARNING('Cleaning up obsolete records...'))
with transaction.atomic():
countries_deleted = Country.objects.exclude(country_code__in=processed_country_codes).count()
regions_deleted = Region.objects.exclude(id__in=processed_region_ids).count()
cities_deleted = City.objects.exclude(id__in=processed_city_ids).count()
Country.objects.exclude(country_code__in=processed_country_codes).delete()
Region.objects.exclude(id__in=processed_region_ids).delete()
City.objects.exclude(id__in=processed_city_ids).delete()
if countries_deleted > 0 or regions_deleted > 0 or cities_deleted > 0:
self.stdout.write(f' Deleted {countries_deleted} obsolete countries, {regions_deleted} regions, {cities_deleted} cities')
else:
self.stdout.write(' No obsolete records found to delete')
self.stdout.write(self.style.SUCCESS('All data imported successfully'))
def _process_batches(self, countries_to_create, regions_to_create, cities_to_create,
countries_to_update, regions_to_update, cities_to_update, batch_size):
"""Process all pending batches in a single transaction, safely"""
with transaction.atomic():
# 1. Create new countries
if countries_to_create:
self.stdout.write(f' Creating {len(countries_to_create)} countries in batches of {batch_size}...')
for i in range(0, len(countries_to_create), batch_size):
batch = countries_to_create[i:i + batch_size]
Country.objects.bulk_create(batch, ignore_conflicts=True)
# 2. Re-fetch the now-saved countries from the DB
saved_country_map = {
c.country_code: c for c in Country.objects.filter(
country_code__in=[c.country_code for c in countries_to_create]
)
}
# 3. Rebind Region.country to actual saved Country objects
for region in regions_to_create:
if isinstance(region.country, Country):
region.country = saved_country_map.get(region.country.country_code)
# 4. Create new regions
if regions_to_create:
self.stdout.write(f' Creating {len(regions_to_create)} regions in batches of {batch_size}...')
for i in range(0, len(regions_to_create), batch_size):
batch = regions_to_create[i:i + batch_size]
Region.objects.bulk_create(batch, ignore_conflicts=True)
# 5. Re-fetch the now-saved regions from the DB
saved_region_map = {
r.id: r for r in Region.objects.filter(
id__in=[r.id for r in regions_to_create]
)
}
# 6. Rebind City.region to actual saved Region objects
for city in cities_to_create:
if isinstance(city.region, Region):
city.region = saved_region_map.get(city.region.id)
# 7. Create new cities
if cities_to_create:
self.stdout.write(f' Creating {len(cities_to_create)} cities in batches of {batch_size}...')
for i in range(0, len(cities_to_create), batch_size):
batch = cities_to_create[i:i + batch_size]
City.objects.bulk_create(batch, ignore_conflicts=True)
# 8. Update existing countries
if countries_to_update:
self.stdout.write(f' Updating {len(countries_to_update)} countries in batches of {batch_size}...')
for i in range(0, len(countries_to_update), batch_size):
batch = countries_to_update[i:i + batch_size]
Country.objects.bulk_update(batch, ['name', 'subregion', 'capital', 'longitude', 'latitude'])
# 9. Update existing regions
if regions_to_update:
self.stdout.write(f' Updating {len(regions_to_update)} regions in batches of {batch_size}...')
for i in range(0, len(regions_to_update), batch_size):
batch = regions_to_update[i:i + batch_size]
Region.objects.bulk_update(batch, ['name', 'country', 'longitude', 'latitude'])
# 10. Update existing cities
if cities_to_update:
self.stdout.write(f' Updating {len(cities_to_update)} cities in batches of {batch_size}...')
for i in range(0, len(cities_to_update), batch_size):
batch = cities_to_update[i:i + batch_size]
City.objects.bulk_update(batch, ['name', 'region', 'longitude', 'latitude'])