# User:Commander Keane/wikidump2mapdata.py
# From Wikipedia, the free encyclopedia

"""
wikidump2mapdata.py

A Python script to create a Wikimedia Commons Data: file for mapframe display from
a wikitext list of UK grid references (with bold headings in the list).
Each marker has a blank title; its description holds the heading and grid reference.
Input: .txt file
Output: commons_data.json
"""

import json
import re

import OSGridConverter

# --- File locations -------------------------------------------------------
wiki_dump_filename = "grid-refs-from-wiki.txt"  # wikitext list read as input
output_filename = "commons_data.json"           # GeoJSON written as output

# --- Output precision -----------------------------------------------------
decimal_places = 6  # Number of decimal places for lat/lon

# --- Line patterns (each is applied with re.match to a stripped line) ------
subheading_regex = r"^.*'''(.*)'''.*$" # Captures within triple quotes
grid_ref_regex = r'^.*([A-Z]{2} [0-9]{4}).*$'  # Captures format "SK 1234"

# --- Parser state ---------------------------------------------------------
# data maps each heading to a list of [grid_ref, lat, lon] entries.
data = {}
# Not referenced anywhere in the visible part of the script.
grid_refs_from_wiki = []
# Heading under which grid references are filed until a bold heading is seen.
subheading = 'default_subheading'

def uk_grid_to_coords(grid_ref):
    """
    Convert a 4-figure UK grid reference (e.g., "SK 0805") to decimal coordinates.

    The two easting digits and two northing digits are each padded with a
    trailing zero, so the 6-figure reference handed to OSGridConverter
    ("SK 0805" -> "SK080050") has the lowest easting and northing of the
    square named by the 4-figure reference (its south-west corner).

    Args:
        grid_ref: String in format "XX 0805" where XX is two letters and 0805
            is 4 digits. Whitespace is ignored and letters are upper-cased.

    Returns:
        tuple: (latitude, longitude) in decimal degrees, each rounded to the
        module-level ``decimal_places``. The datum is whatever
        ``OSGridConverter.grid2latlong`` returns by default (presumably
        WGS84 -- confirm against the library documentation).

    Raises:
        ValueError: If ``grid_ref`` is not two letters followed by exactly
            four digits. Without this check a longer or shorter reference
            would be sliced at fixed positions and silently converted to the
            wrong location.
    """
    compact = "".join(grid_ref.split()).upper()
    if not re.fullmatch(r"[A-Z]{2}[0-9]{4}", compact):
        raise ValueError(
            f"Expected a 4-figure grid reference like 'SK 0805', got {grid_ref!r}"
        )

    letters, easting, northing = compact[:2], compact[2:4], compact[4:6]

    # Pad to 3 figures each (6-figure reference)
    full_grid_ref = f"{letters}{easting}0{northing}0"

    # Convert using OSGridConverter
    coords = OSGridConverter.grid2latlong(full_grid_ref)
    lat = round(coords.latitude, decimal_places)
    lon = round(coords.longitude, decimal_places)
    return lat, lon

# Parse the wikitext dump line by line. A line containing '''bold''' text
# starts a new heading; a line containing a grid reference is converted and
# filed under the most recent heading. One line may do both.
# Wikitext is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding.
with open(wiki_dump_filename, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()

        match_subheading = re.match(subheading_regex, stripped)
        if match_subheading:
            subheading = match_subheading.group(1)
            # setdefault rather than "= []": a heading that appears twice
            # must not discard the references collected the first time.
            data.setdefault(subheading, [])

        match_grid_ref = re.match(grid_ref_regex, stripped)
        if match_grid_ref:
            grid_ref = match_grid_ref.group(1)
            lat, lon = uk_grid_to_coords(grid_ref)
            # setdefault also covers references that precede any heading,
            # which are filed under the initial default subheading instead
            # of raising KeyError.
            data.setdefault(subheading, []).append([grid_ref, lat, lon])

# Create GeoJSON output for Wikimedia Commons
output_start = '''{
    "type": "FeatureCollection",
    "features": ['''

output_end = '''    ]
}'''

color = "#555555"  # same marker colour for every feature

commons_output = [output_start]
for heading, items in data.items():
    for grid_ref, lat, lon in items:
        # The heading comes straight from the wikitext and may contain
        # quotes or backslashes; json.dumps emits a correctly quoted and
        # escaped JSON string (it supplies the surrounding double quotes).
        # ensure_ascii=False keeps non-ASCII headings readable in the file.
        description = json.dumps(
            f"{heading}, grid reference: {grid_ref}", ensure_ascii=False
        )
        item_output = f'''        {{
            "type": "Feature",
            "properties": {{ "marker-color": "{color}", "title":"", "description":{description},"marker-size": "small"}},
            "geometry": {{
                "type": "Point",
                "coordinates": [{lon}, {lat}]
            }}
        }},'''
        commons_output.append(item_output)

# Remove trailing comma from last item. With no features the last entry is
# output_start, which does not end in a comma, so this is a no-op.
commons_output[-1] = commons_output[-1].rstrip(',')
commons_output.append(output_end)

# Write as UTF-8 regardless of the platform's locale encoding; every chunk
# is followed by a newline.
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write("\n".join(commons_output) + "\n")