celestia/src/tools/celestia-gaia-stardb/make_stardb.py

373 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
# gaia-stardb: Processing Gaia DR2 for celestia.Sci/Celestia
# Copyright (C) 2019–2020 Andrew Tribick
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Makes the star database."""
import contextlib
import gzip
import os
import struct
from zipfile import ZipFile, ZIP_DEFLATED
import numpy as np
import astropy.units as u
from astropy.table import MaskedColumn, Table, join, unique, vstack
from parse_hip import process_hip
from parse_tyc import process_tyc
from parse_utils import WorkaroundCDSReader, open_cds_tarfile
from spparse import CEL_UNKNOWN_STAR, parse_spectrum
# version string embedded in the output archive name
VERSION = "1.0.4"
# remove the following objects from the output
EXCLUSIONS = [
    60936,  # quasar 3C 273
    114110,  # non-existent star (see HIP1 errata)
    114176,  # non-existent star (see HIP1 errata)
]
# temperatures from star.cpp, spectral types O3-M9 (67 entries, one per subtype)
TEFF_SPEC = np.array([
    52500, 48000, 44500, 41000, 38000, 35800, 33000,
    30000, 25400, 22000, 18700, 17000, 15400, 14000, 13000, 11900, 10500,
    9520, 9230, 8970, 8720, 8460, 8200, 8020, 7850, 7580, 7390,
    7200, 7050, 6890, 6740, 6590, 6440, 6360, 6280, 6200, 6110,
    6030, 5940, 5860, 5830, 5800, 5770, 5700, 5630, 5570, 5410,
    5250, 5080, 4900, 4730, 4590, 4350, 4200, 4060, 3990, 3920,
    3850, 3720, 3580, 3470, 3370, 3240, 3050, 2940, 2640, 2000])
# bin edges midway between adjacent subtype temperatures, for use with np.digitize
TEFF_BINS = (TEFF_SPEC[:-1] + TEFF_SPEC[1:]) // 2
# vectorised spectral-type parser returning packed uint16 codes
parse_spectrum_vec = np.vectorize(parse_spectrum, otypes=[np.uint16])  # pylint: disable=invalid-name
# packed codes for the subtypes O3..M9, aligned index-for-index with TEFF_SPEC
CEL_SPECS = parse_spectrum_vec(['OBAFGKM'[i//10]+str(i%10) for i in range(3, 70)])
def load_ubvri() -> Table:
    """Load UBVRI Teff calibration from VizieR archive."""
    print('Loading UBVRI calibration')
    archive_path = os.path.join('vizier', 'ubvriteff.tar.gz')
    wanted_columns = ['V-K', 'B-V', 'V-I', 'J-K', 'H-K', 'Teff']
    with open_cds_tarfile(archive_path) as archive:
        return archive.read_gzip('table3.dat', wanted_columns)
def parse_spectra(data: Table) -> Table:
    """Parse the spectral types into the celestia.Sci format."""
    print('Parsing spectral types')
    # unmask blank spectral types, then parse each distinct type only once
    data['SpType'] = data['SpType'].filled('')
    distinct_types = unique(data['SpType',])
    distinct_types['CelSpec'] = parse_spectrum_vec(distinct_types['SpType'])
    # join the parsed codes back onto the full table via the SpType column
    return join(data, distinct_types)
def estimate_magnitudes(data: Table) -> None:
    """Estimates magnitudes and color indices from G magnitude and BP-RP.

    Formula used is from Evans et al. (2018) "Gaia Data Release 2: Photometric
    content and validation".

    Mutates *data* in place: fills Vmag/e_Vmag, derives B-V, V-I, V-K, J-K and
    H-K colour indices (with errors) where missing, then drops the single-band
    magnitude columns that are no longer needed.
    """
    print("Computing missing magnitudes and color indices")
    # V estimate: a missing BP-RP is treated as zero so the colour terms vanish
    bp_rp = data['bp_rp'].filled(0)
    bp_rp2 = bp_rp**2
    data['Vmag'] = MaskedColumn(
        data['Vmag'].filled(
            data['phot_g_mean_mag'].filled(np.nan) + 0.01760 + bp_rp*0.006860 + bp_rp2*0.1732))
    # constant error assigned to V magnitudes estimated from G (from the G->V fit)
    data['e_Vmag'] = MaskedColumn(data['e_Vmag'].filled(0.045858))
    # re-mask rows where G itself was missing (NaN propagated through the formula)
    data['Vmag'].mask = np.isnan(data['Vmag'])
    data['e_Vmag'].mask = data['Vmag'].mask
    # I estimate: here a missing BP-RP must propagate NaN so I stays undefined
    bp_rp = data['bp_rp'].filled(np.nan)
    bp_rp2 = bp_rp**2
    imag = data['phot_g_mean_mag'].filled(np.nan) - 0.02085 - bp_rp*0.7419 + bp_rp2*0.09631
    e_imag = np.where(np.isnan(imag), np.nan, 0.04956)
    # unmasked working copies (NaN marks a missing value) for index arithmetic
    f_bmag = data['Bmag'].filled(np.nan)
    f_vmag = data['Vmag'].filled(np.nan)
    f_jmag = data['Jmag'].filled(np.nan)
    f_hmag = data['Hmag'].filled(np.nan)
    f_kmag = data['Kmag'].filled(np.nan)
    f_e_bmag = data['e_Bmag'].filled(np.nan)
    f_e_vmag = data['e_Vmag'].filled(np.nan)
    f_e_jmag = data['e_Jmag'].filled(np.nan)
    f_e_hmag = data['e_Hmag'].filled(np.nan)
    f_e_kmag = data['e_Kmag'].filled(np.nan)
    # fill missing colour indices from band differences; errors add in quadrature
    data['B-V'] = MaskedColumn(data['B-V'].filled(f_bmag - f_vmag))
    data['e_B-V'] = MaskedColumn(data['e_B-V'].filled(np.sqrt(f_e_bmag**2 + f_e_vmag**2)))
    data['V-I'] = MaskedColumn(data['V-I'].filled(f_vmag - imag))
    data['e_V-I'] = MaskedColumn(data['e_V-I'].filled(np.sqrt(f_e_vmag**2 + e_imag**2)))
    # V-K, J-K, H-K have no catalogue column to fill, so they are computed outright
    data['V-K'] = MaskedColumn(f_vmag - f_kmag)
    data['e_V-K'] = MaskedColumn(np.sqrt(f_e_vmag**2 + f_e_kmag**2))
    data['J-K'] = MaskedColumn(f_jmag - f_kmag)
    data['e_J-K'] = MaskedColumn(np.sqrt(f_e_jmag**2 + f_e_kmag**2))
    data['H-K'] = MaskedColumn(f_hmag - f_kmag)
    data['e_H-K'] = MaskedColumn(np.sqrt(f_e_hmag**2 + f_e_kmag**2))
    # re-mask every value that came out NaN (i.e. one of its inputs was missing)
    data['B-V'].mask = np.logical_or(data['B-V'].mask, np.isnan(data['B-V']))
    data['e_B-V'].mask = np.logical_or(data['e_B-V'].mask, np.isnan(data['e_B-V']))
    data['V-I'].mask = np.logical_or(data['V-I'].mask, np.isnan(data['V-I']))
    data['e_V-I'].mask = np.logical_or(data['e_V-I'].mask, np.isnan(data['e_V-I']))
    data['V-K'].mask = np.isnan(data['V-K'])
    data['e_V-K'].mask = np.isnan(data['e_V-K'])
    data['J-K'].mask = np.isnan(data['J-K'])
    data['e_J-K'].mask = np.isnan(data['e_J-K'])
    data['H-K'].mask = np.isnan(data['H-K'])
    data['e_H-K'].mask = np.isnan(data['e_H-K'])
    # the per-band magnitudes are no longer needed once the indices exist
    # (Vmag is kept — it is used downstream for the absolute magnitude)
    data.remove_columns(['Bmag', 'e_Bmag', 'e_Vmag', 'Jmag', 'e_Jmag', 'Hmag', 'e_Hmag',
                         'Kmag', 'e_Kmag'])
def estimate_temperatures(data: Table) -> None:
    """Estimate the temperature of stars.

    Mutates *data* in place, adding a 'teff_est' column (in kelvin): a weighted
    average over all UBVRI calibration rows, where each row is weighted by the
    inverse chi-square distance between its colour indices and the star's.
    """
    ubvri_data = load_ubvri()
    print('Estimating temperatures from color indices')
    # working table of the star's colour indices; NaN marks a missing index and
    # errors are floored at 0.001 to avoid division blow-ups below
    indices = Table(
        [
            data['B-V'].filled(np.nan),
            data['V-I'].filled(np.nan),
            data['V-K'].filled(np.nan),
            data['J-K'].filled(np.nan),
            data['H-K'].filled(np.nan),
            np.maximum(data['e_B-V'].filled(np.nan), 0.001),
            np.maximum(data['e_V-I'].filled(np.nan), 0.001),
            np.maximum(data['e_V-K'].filled(np.nan), 0.001),
            np.maximum(data['e_J-K'].filled(np.nan), 0.001),
            np.maximum(data['e_H-K'].filled(np.nan), 0.001),
        ],
        names=['B-V','V-I','V-K','J-K','H-K',
               'e_B-V','e_V-I','e_V-K','e_J-K','e_H-K'])
    weights = np.full_like(data['HIP'], 0, dtype=np.float64)
    teffs = np.full_like(data['HIP'], 0, dtype=np.float64)
    for row in ubvri_data:
        # chi-square distance per star for this calibration row; nan_to_num
        # drops missing indices from the sum, and the floor of 0.001 keeps an
        # exact match from producing an infinite weight
        sumsq = np.maximum(
            np.nan_to_num(((indices['B-V']-row['B-V'])/indices['e_B-V'])**2)
            + np.nan_to_num(((indices['V-K']-row['V-K'])/indices['e_V-K'])**2)
            + np.nan_to_num(((indices['J-K']-row['J-K'])/indices['e_J-K'])**2)
            + np.nan_to_num(((indices['V-I']-row['V-I'])/indices['e_V-I'])**2)
            + np.nan_to_num(((indices['H-K']-row['H-K'])/indices['e_H-K'])**2), 0.001)
        teffs += row['Teff'] / sumsq
        weights += 1.0 / sumsq
    # inverse-chi-square weighted mean of the calibration temperatures
    data['teff_est'] = teffs / weights
    data['teff_est'].unit = u.K
def estimate_spectra(data: Table) -> Table:
    """Estimate the spectral type of stars."""
    # stars without a catalogue Teff are candidates for colour-based estimation
    missing_teff = data[data['teff_val'].mask]
    # a star is only usable if at least one colour index is present, i.e. not
    # every index column is masked
    # temporarily disable no-member error in pylint, as it cannot see the reduce method
    # pylint: disable=no-member
    all_indices_masked = np.logical_and.reduce((missing_teff['B-V'].mask,
                                                missing_teff['V-I'].mask,
                                                missing_teff['V-K'].mask,
                                                missing_teff['J-K'].mask,
                                                missing_teff['H-K'].mask))
    # pylint: enable=no-member
    missing_teff = missing_teff[np.logical_not(all_indices_masked)]
    estimate_temperatures(missing_teff)
    # merge the estimated temperatures back, then use them to fill teff_val
    data = join(data,
                missing_teff['HIP', 'teff_est'],
                keys=['HIP'],
                join_type='left')
    data['teff_val'] = data['teff_val'].filled(data['teff_est'].filled(np.nan))
    # drop stars with no temperature at all, then bin by temperature to get a
    # celestia spectral code
    data = data[np.logical_not(np.isnan(data['teff_val']))]
    data['CelSpec'] = CEL_SPECS[np.digitize(data['teff_val'], TEFF_BINS)]
    return data
def load_sao() -> Table:
    """Loads the SAO catalog.

    Returns a table of (SAO, HD) pairs deduplicated in both directions, so
    each SAO number maps to at most one HD number and vice versa.
    """
    print("Loading SAO")
    # explicit encoding: CDS ReadMe files are ASCII, so don't depend on the
    # locale's default codec (the gzip open below already pins it)
    with open(os.path.join('vizier', 'sao.readme'), 'r', encoding='ascii') as readme:
        reader = WorkaroundCDSReader('sao.dat', ['SAO', 'HD'], [np.int64, np.int64], readme)
    with gzip.open(os.path.join('vizier', 'sao.dat.gz'), 'rt', encoding='ascii') as f:
        data = reader.read(f)
    # keep one HD per SAO number and one SAO per HD number
    data = unique(data.group_by('SAO'), keys=['HD'])
    data = unique(data.group_by('HD'), keys=['SAO'])
    return data
def merge_all() -> Table:
    """Merges the HIP and TYC data.

    Returns a single table keyed on 'HIP' (Tycho identifiers are folded into
    the HIP column for stars without a Hipparcos number), with SAO numbers
    cross-checked against the SAO catalogue.
    """
    hip_data = process_hip()
    # extract the non-Gaia sources to make the merging easier
    non_gaia = hip_data[hip_data['source_id'].mask]
    # merge object data for objects in both catalogues
    hip_data = join(hip_data[np.logical_not(hip_data['source_id'].mask)],
                    process_tyc(),
                    keys=['source_id'],
                    table_names=['hip', 'tyc'],
                    join_type='outer')
    # Mask blank spectral type and component identifiers
    for str_col in (c for c in hip_data.colnames if hip_data[c].dtype.kind == 'U'):
        hip_data[str_col].mask = np.logical_or(hip_data[str_col].mask, hip_data[str_col] == '')
    # columns where the Tycho value wins when both catalogues provide one
    prefer_tyc = {'HD', 'SAO', 'Comp'}
    # collapse each pair of *_hip/*_tyc columns into a single column
    for base_col in (c[:-4] for c in hip_data.colnames if c.endswith('_hip')):
        hip_col = base_col + '_hip'
        tyc_col = base_col + '_tyc'
        hip_data.rename_column(hip_col, base_col)
        if isinstance(hip_data[base_col], MaskedColumn):
            # result is masked only where BOTH catalogues lack a value
            mask = np.logical_and(hip_data[base_col].mask, hip_data[tyc_col].mask)
            if base_col in prefer_tyc:
                base_data = hip_data[tyc_col].filled(hip_data[base_col])
            else:
                base_data = hip_data[base_col].filled(hip_data[tyc_col])
            hip_data[base_col] = MaskedColumn(base_data, mask=mask)
        hip_data.remove_column(tyc_col)
    # fall back to the TYC identifier where there is no HIP number
    hip_data['HIP'] = hip_data['HIP'].filled(hip_data['TYC'])
    hip_data.remove_columns('TYC')
    # Add the non-Gaia stars back into the dataset
    hip_data = vstack([hip_data, non_gaia], join_type='outer', metadata_conflicts='silent')
    # Merge SAO, preferring the values from the SAO catalogue
    sao = load_sao()
    # restrict to SAO entries whose HD number actually appears in our data
    sao = sao[np.isin(sao['HD'], hip_data[np.logical_not(hip_data['HD'].mask)]['HD'])]
    # mask existing SAO values that the SAO catalogue will re-supply via HD
    hip_data['SAO'].mask = np.logical_or(hip_data['SAO'].mask,
                                         np.isin(hip_data['SAO'], sao['SAO']))
    hd_sao = join(hip_data[np.logical_not(hip_data['HD'].mask)],
                  sao,
                  keys=['HD'],
                  table_names=['xref', 'sao'],
                  join_type='left')
    hd_sao.rename_column('SAO_xref', 'SAO')
    # SAO-catalogue value wins; keep the cross-reference value as fallback
    hd_sao['SAO'] = MaskedColumn(hd_sao['SAO_sao'].filled(hd_sao['SAO']),
                                 mask=np.logical_and(hd_sao['SAO'].mask,
                                                     hd_sao['SAO_sao'].mask))
    hd_sao.remove_column('SAO_sao')
    # recombine rows without an HD number with the SAO-merged rows
    return vstack([hip_data[hip_data['HD'].mask], hd_sao], join_type='exact')
# Obliquity of the ecliptic (degrees -> radians); used to rotate equatorial
# coordinates into Celestia's ecliptic-based frame.
OBLIQUITY = np.radians(23.4392911)
COS_OBLIQUITY = np.cos(OBLIQUITY)
SIN_OBLIQUITY = np.sin(OBLIQUITY)
# Rotation about the x-axis by OBLIQUITY, applied to the (x, y, z) vectors
# built in process_data().
ROT_MATRIX = np.array([[1, 0, 0],
                       [0, COS_OBLIQUITY, SIN_OBLIQUITY],
                       [0, -SIN_OBLIQUITY, COS_OBLIQUITY]])
def process_data() -> Table:
    """Processes the missing data values.

    Runs the full pipeline: merge catalogues, fill missing magnitudes, parse
    or estimate spectral types, and compute ecliptic-frame positions.
    """
    data = merge_all()
    # require a usable distance, and drop the known-bad HIP entries
    data = data[np.logical_not(data['dist_use'].mask)]
    data = data[np.isin(data['HIP'], EXCLUSIONS, invert=True)]
    estimate_magnitudes(data)
    data = parse_spectra(data)
    # for stars whose spectral type could not be parsed, estimate a type from
    # the effective temperature / colour indices
    unknown_spectra = data[data['CelSpec'] == CEL_UNKNOWN_STAR]['HIP', 'teff_val', 'B-V', 'e_B-V',
                                                                'V-I', 'e_V-I', 'V-K', 'e_V-K',
                                                                'J-K', 'e_J-K', 'H-K', 'e_H-K']
    unknown_spectra = estimate_spectra(unknown_spectra)
    data = join(data,
                unknown_spectra['HIP', 'CelSpec'],
                keys=['HIP'],
                join_type='left',
                table_names=['data', 'est'])
    # take the estimated code only where parsing failed
    data['CelSpec'] = np.where(data['CelSpec_data'] == CEL_UNKNOWN_STAR,
                               data['CelSpec_est'].filled(CEL_UNKNOWN_STAR),
                               data['CelSpec_data'])
    data.remove_columns(['phot_g_mean_mag', 'bp_rp', 'teff_val', 'SpType', 'B-V', 'e_B-V', 'V-I',
                         'e_V-I', 'V-K', 'e_V-K', 'J-K', 'e_J-K', 'H-K', 'e_H-K', 'CelSpec_est',
                         'CelSpec_data'])
    # absolute magnitude; the 5*(log10(d)-1) form assumes dist_use is in
    # parsecs at this point (it is only converted to light-years below)
    data['Vmag_abs'] = data['Vmag'] - 5*(np.log10(data['dist_use'])-1)
    print('Converting coordinates to ecliptic frame')
    data['ra'].convert_unit_to(u.rad)
    data['dec'].convert_unit_to(u.rad)
    data['dist_use'].convert_unit_to(u.lyr)
    # build Cartesian vectors from (ra, dec, dist) and rotate into the
    # ecliptic frame; note the axis convention (y from dec, z from -sin(ra))
    coords = np.matmul(ROT_MATRIX,
                       np.array([data['dist_use']*np.cos(data['ra'])*np.cos(data['dec']),
                                 data['dist_use']*np.sin(data['dec']),
                                 -data['dist_use']*np.sin(data['ra'])*np.cos(data['dec'])]))
    data['x'] = coords[0]
    data['y'] = coords[1]
    data['z'] = coords[2]
    data['x'].unit = u.lyr
    data['y'].unit = u.lyr
    data['z'].unit = u.lyr
    return data
def write_starsdat(data: Table, outfile: str) -> None:
    """Write the stars.dat file."""
    print('Writing stars.dat')
    # per-star record: HIP number, x/y/z, fixed-point magnitude, spectral code
    record = struct.Struct('<L3fhH')
    with open(outfile, 'wb') as out:
        # header: magic bytes, format version, record count
        out.write(struct.pack('<8sHL', b'CELSTARS', 0x0100, len(data)))
        print(f' Writing {len(data)} records')
        rows = zip(data['HIP'], data['x'], data['y'], data['z'],
                   data['Vmag_abs'], data['CelSpec'])
        for hip, x, y, z, vmag_abs, celspec in rows:
            # absolute magnitude is stored as a signed fixed-point value x256
            out.write(record.pack(hip, x, y, z, int(round(vmag_abs*256)), celspec))
def write_xindex(data: Table, field: str, outfile: str) -> None:
    """Write a cross-index file."""
    print('Writing '+field+' cross-index')
    print(' Extracting cross-index data')
    # keep rows with a catalogue number; one entry per number, lowest
    # component/HIP first
    xref = data[np.logical_not(data[field].mask)]['HIP', 'Comp', field]
    xref['Comp'] = xref['Comp'].filled('')
    xref = unique(xref.group_by([field, 'Comp', 'HIP']), keys=[field])
    print(f' Writing {len(xref)} records')
    entry = struct.Struct('<2L')
    with open(outfile, 'wb') as out:
        # header: magic bytes and format version
        out.write(struct.pack('<8sH', b'CELINDEX', 0x0100))
        for hip, cat in zip(xref['HIP'], xref[field]):
            # records are (catalogue number, HIP number) pairs
            out.write(entry.pack(cat, hip))
def make_stardb() -> None:
    """Make the Celestia star database files."""
    data = process_data()
    # make sure the output directory exists before writing into it
    with contextlib.suppress(FileExistsError):
        os.mkdir('output')
    write_starsdat(data, os.path.join('output', 'stars.dat'))
    # one cross-index file per external catalogue field
    for fieldname, outfile in (('HD', 'hdxindex.dat'), ('SAO', 'saoxindex.dat')):
        write_xindex(data, fieldname, os.path.join('output', outfile))
    print("Creating archive")
    archivename = f'celestia-gaia-stardb-{VERSION}'
    contents = ['stars.dat', 'hdxindex.dat', 'saoxindex.dat', 'LICENSE.txt', 'CREDITS.md']
    # bundle the generated files plus licence/credits into a versioned zip
    with ZipFile(f'{archivename}.zip', 'w', compression=ZIP_DEFLATED, compresslevel=9) as zf:
        for member in contents:
            zf.write(os.path.join('output', member), arcname=os.path.join(archivename, member))
# script entry point: build the database when run directly
if __name__ == '__main__':
    make_stardb()