From 88f18aa3be87e663efb41a5d656eb48eb347307b Mon Sep 17 00:00:00 2001 From: gaoflow Date: Tue, 2 Jun 2026 05:03:40 +0200 Subject: [PATCH] Parse read_nsrdb_psm4 header with csv module to keep quoted commas read_nsrdb_psm4 split the three header lines with a naive str.split(','), which broke spectral-on-demand files whose column names are quoted fields containing commas (e.g. '"GaAs (Bauhuis et al., 2009)"'). Such names were split into spurious columns, raising on read. Parse the header lines with the csv module so quoted fields are kept intact. Fixes #2736 --- docs/sphinx/source/whatsnew/v0.15.2.rst | 5 +++++ pvlib/iotools/psm4.py | 12 +++++++++--- tests/iotools/test_psm4.py | 25 +++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/source/whatsnew/v0.15.2.rst b/docs/sphinx/source/whatsnew/v0.15.2.rst index 35d7dc675c..51bec8822d 100644 --- a/docs/sphinx/source/whatsnew/v0.15.2.rst +++ b/docs/sphinx/source/whatsnew/v0.15.2.rst @@ -26,6 +26,10 @@ Bug fixes represent the end of the averaging interval, consistent with ERA5 conventions. (:issue:`2772`, :pull:`2773`) +* :py:func:`pvlib.iotools.read_nsrdb_psm4` now parses the file header with the + :py:mod:`csv` module instead of a naive ``str.split(',')``, so quoted column + names containing commas (e.g. the material names in spectral-on-demand files) + are no longer split into spurious columns. (:issue:`2736`, :pull:`2771`) Enhancements ~~~~~~~~~~~~ @@ -67,6 +71,7 @@ Maintenance Contributors ~~~~~~~~~~~~ * :ghuser:`Omesh37` +* :ghuser:`gaoflow` * Cliff Hansen (:ghuser:`cwhanse`) * :ghuser:`shethkajal7` * Arthur Onno (:ghuser:`ArthurOnnoTerabase`) diff --git a/pvlib/iotools/psm4.py b/pvlib/iotools/psm4.py index 9eb760f382..fc8d098a09 100644 --- a/pvlib/iotools/psm4.py +++ b/pvlib/iotools/psm4.py @@ -6,6 +6,7 @@ https://developer.nlr.gov/docs/solar/nsrdb/nsrdb-GOES-full-disc-v4-0-0-download/ """ +import csv import io from urllib.parse import urljoin import requests @@ -723,11 +724,16 @@ def read_nsrdb_psm4(filename, map_variables=True): `_ """ with tools._file_context_manager(filename) as fbuf: + # The first 3 header lines are parsed with the csv module rather than a + # naive str.split(',') so that quoted fields containing commas are kept + # intact. Spectral-on-demand files, for instance, have column names + # like '"GaAs (Bauhuis et al., 2009)"' whose embedded commas would + # otherwise be split into spurious columns (see GH #2736). # The first 2 lines of the response are headers with metadata - metadata_fields = fbuf.readline().split(',') - metadata_values = fbuf.readline().split(',') + metadata_fields = next(csv.reader([fbuf.readline()])) + metadata_values = next(csv.reader([fbuf.readline()])) # get the column names so we can set the dtypes - columns = fbuf.readline().split(',') + columns = next(csv.reader([fbuf.readline()])) columns[-1] = columns[-1].strip() # strip trailing newline # Since the header has so many columns, excel saves blank cols in the # data below the header lines. diff --git a/tests/iotools/test_psm4.py b/tests/iotools/test_psm4.py index 3b4313b070..c16a714aa3 100644 --- a/tests/iotools/test_psm4.py +++ b/tests/iotools/test_psm4.py @@ -185,6 +185,31 @@ def test_read_nsrdb_psm4_map_variables(): assert_index_equal(data.columns, pd.Index(columns_mapped)) +def test_read_nsrdb_psm4_quoted_columns_with_commas(): + """spectral-on-demand files have quoted column names containing commas; + these must not be split into spurious columns (GH #2736)""" + # Minimal NSRDB file whose column header (3rd line) has quoted material + # names with embedded commas, which is valid CSV. A naive str.split(',') + # would break these into extra columns and raise on read. + content = ( + "Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone," + "Elevation,Local Time Zone,Version\n" + "NSRDB,1,-,-,-,40.0,-105.0,-7,1600,-7,4.0.1\n" + 'Year,Month,Day,Hour,Minute,GHI,"GaAs (Bauhuis et al., 2009)",' + '"InGaP (Gray, 2008)"\n' + "2023,1,1,0,0,0,0.1,0.2\n" + "2023,1,1,1,0,5,0.3,0.4\n" + ) + data, metadata = psm4.read_nsrdb_psm4(StringIO(content), + map_variables=False) + assert list(data.columns) == [ + 'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI', + 'GaAs (Bauhuis et al., 2009)', 'InGaP (Gray, 2008)'] + assert data.shape == (2, 8) + # the embedded-comma data columns round-trip as floats + assert data['GaAs (Bauhuis et al., 2009)'].tolist() == [0.1, 0.3] + + @pytest.mark.remote_data @pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY) def test_get_nsrdb_psm4_aggregated_parameter_mapping(nlr_api_key):