-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Add read_bsrn function #1145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add read_bsrn function #1145
Changes from 30 commits
73e6f9e
31203b3
7a437c3
93b92c8
b0a6ddf
c6447a3
52a952a
870095a
444e979
f11288e
93deb2f
2fc95ad
656bbda
c10d75b
d16d935
86cfb17
2eb3d44
3218ab5
ad8d45a
fe632b8
db1ac24
a4c1d6f
fc6f56d
d7a5af8
17d9b0b
6ab294f
5f59024
66209e4
32a7cfb
536f53c
63dd3ac
6924183
5e5f9d5
5cf3d30
71895d1
b5ed6ee
23d4525
8dae943
23c3455
260b68a
6b52a72
7c12848
17206df
9466626
4d3a21c
4db55be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
"""Functions to read data from the Baseline Surface Radiation Network (BSRN). | ||
.. codeauthor:: Adam R. Jensen <adam-r-j@hotmail.com> | ||
""" | ||
|
||
import pandas as pd | ||
import gzip | ||
|
||
# Character ranges of the 13 fixed-width fields on each physical line of an
# LR0100 record; passed as ``colspecs`` to ``pandas.read_fwf`` in read_bsrn.
COL_SPECS = [
    (0, 3), (4, 9), (10, 16), (16, 22), (22, 27),
    (27, 32), (32, 39), (39, 45), (45, 50), (50, 55),
    (55, 64), (64, 70), (70, 75),
]
|
||
# Column names assigned by read_bsrn once the two physical lines of each
# record have been placed side by side (2 x 13 fields = 26 names). The five
# 'empty' entries label unused fields and are dropped again by read_bsrn.
BSRN_COLUMNS = (
    ['day', 'minute']
    + [var + stat for var in ('ghi', 'dni')
       for stat in ('', '_std', '_min', '_max')]
    + ['empty'] * 5
    + [var + stat for var in ('dhi', 'lwd')
       for stat in ('', '_std', '_min', '_max')]
    + ['temp_air', 'relative_humidity', 'pressure']
)
|
||
|
||
def read_bsrn(filename):
    """
    Read a BSRN station-to-archive file into a DataFrame.

    The BSRN (Baseline Surface Radiation Network) is a world wide network
    of high-quality solar radiation monitoring stations as described in [1]_.
    The function only parses the basic measurements (LR0100), which include
    global, diffuse, direct and downwelling long-wave radiation [2]_. Future
    updates may include parsing of additional data and meta-data.

    BSRN files are freely available and can be accessed via FTP [3]_. Required
    username and password are easily obtainable as described in the BSRN's
    Data Release Guidelines [4]_.

    Parameters
    ----------
    filename: str
        A relative or absolute file path. Files whose name ends in ``.gz``
        are opened as gzip-compressed text.

    Returns
    -------
    data: DataFrame
        A DataFrame with the columns as described below, indexed by a
        timezone-aware (UTC) timestamp. For more extensive description of
        the variables, consult [2]_.

    Raises
    ------
    KeyError
        If the file does not contain the logical record LR0100.

    Notes
    -----
    The values -999.0 and -99.9 are treated as missing data.

    The data DataFrame includes the following fields:

    =======================  ======  ==========================================
    Key                      Format  Description
    =======================  ======  ==========================================
    day                      int     Day of the month 1-31
    minute                   int     Minute of the day 0-1439
    ghi                      float   Mean global horizontal irradiance [W/m^2]
    ghi_std                  float   Std. global horizontal irradiance [W/m^2]
    ghi_min                  float   Min. global horizontal irradiance [W/m^2]
    ghi_max                  float   Max. global horizontal irradiance [W/m^2]
    dni                      float   Mean direct normal irradiance [W/m^2]
    dni_std                  float   Std. direct normal irradiance [W/m^2]
    dni_min                  float   Min. direct normal irradiance [W/m^2]
    dni_max                  float   Max. direct normal irradiance [W/m^2]
    dhi                      float   Mean diffuse horizontal irradiance [W/m^2]
    dhi_std                  float   Std. diffuse horizontal irradiance [W/m^2]
    dhi_min                  float   Min. diffuse horizontal irradiance [W/m^2]
    dhi_max                  float   Max. diffuse horizontal irradiance [W/m^2]
    lwd                      float   Mean downward long-wave radiation [W/m^2]
    lwd_std                  float   Std. downward long-wave radiation [W/m^2]
    lwd_min                  float   Min. downward long-wave radiation [W/m^2]
    lwd_max                  float   Max. downward long-wave radiation [W/m^2]
    temp_air                 float   Air temperature [°C]
    relative_humidity        float   Relative humidity [%]
    pressure                 float   Atmospheric pressure [hPa]
    =======================  ======  ==========================================

    References
    ----------
    .. [1] `World Radiation Monitoring Center - Baseline Surface Radiation
        Network (BSRN) <https://bsrn.awi.de/>`_
    .. [2] `Update of the Technical Plan for BSRN Data Management, 2013,
       Global Climate Observing System (GCOS) GCOS-174.
       <https://bsrn.awi.de/fileadmin/user_upload/bsrn.awi.de/Publications/gcos-174.pdf>`_
    .. [3] `BSRN Data Retrieval via FTP
       <https://bsrn.awi.de/data/data-retrieval-via-ftp/>`_
    .. [4] `BSRN Data Release Guidelines
       <https://bsrn.awi.de/data/conditions-of-data-release/>`_
    """
    # Read file and store the starting line number for each logical record (LR)
    line_no_dict = {}
    if str(filename).endswith('.gz'):  # check if file is a gzipped (.gz) file
        open_func, mode = gzip.open, 'rt'
    else:
        open_func, mode = open, 'r'
    with open_func(filename, mode) as f:
        for num, line in enumerate(f):
            if num == 1:  # Get month and year from the 2nd line
                start_date = pd.Timestamp(year=int(line[7:11]),
                                          month=int(line[3:6]), day=1,
                                          tz='UTC')  # BSRN timestamps are UTC

            if line.startswith('*'):  # Find start of all logical records
                line_no_dict[line[2:6]] = num  # key is 4 digit LR number

    # Determine start and end line of logical record LR0100 to be parsed.
    # start_row is the first data line; end_row is exclusive.
    start_row = line_no_dict['0100'] + 1
    if start_row - 1 == max(line_no_dict.values()):
        # LR0100 is the last logical record, so parse the rest of the file.
        # num is the 0-based index of the last line, hence the +1 so that the
        # final data line is included in nrows.
        end_row = num + 1
    else:  # otherwise parse until the beginning of the next logical record
        end_row = min(i for i in line_no_dict.values() if i > start_row)
    nrows = end_row - start_row

    # Read file as a fixed width file (fwf)
    # NOTE: the file is read a second time here. Collecting the LR0100 lines
    # into an in-memory buffer during the loop above would avoid that.
    data = pd.read_fwf(filename, skiprows=start_row, nrows=nrows, header=None,
                       colspecs=COL_SPECS, na_values=[-999.0, -99.9])

    # Each record spans two consecutive lines: pair them up with a
    # (record number, line-within-record) multi-index and unstack, resulting
    # in one column for each variable
    data = data.set_index([data.index // 2, data.index % 2])
    data = data.unstack(level=1).swaplevel(i=0, j=1, axis='columns')

    # Sort columns to match original order and assign column names
    data = data.reindex(sorted(data.columns), axis='columns')
    data.columns = BSRN_COLUMNS
    # Drop empty columns
    data = data.drop('empty', axis='columns')

    # Change day and minute type to integer
    data['day'] = data['day'].astype('Int64')
    data['minute'] = data['minute'].astype('Int64')

    # Set datetime index
    data.index = (start_date
                  + pd.to_timedelta(data['day'] - 1, unit='d')
                  + pd.to_timedelta(data['minute'], unit='min'))

    return data
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
""" | ||
tests for :mod:`pvlib.iotools.bsrn` | ||
""" | ||
|
||
|
||
import pandas as pd | ||
import pytest | ||
|
||
from pvlib.iotools import bsrn | ||
from conftest import DATA_DIR | ||
|
||
|
||
# Awaiting permission from BSRN to upload test file
# Gzipped BSRN sample file read by the tests in this module.
testfile = DATA_DIR / 'bsrn-pay0616.dat.gz'
|
||
|
||
def test_read_bsrn_columns():
    """Check that a sample of the expected column names is present."""
    columns = bsrn.read_bsrn(testfile).columns
    sampled_names = ('ghi', 'dni_std', 'dhi_min', 'lwd_max',
                     'relative_humidity')
    for name in sampled_names:
        assert name in columns
|
||
|
||
@pytest.fixture
def expected_index():
    """Return a 1-minute UTC index of 43200 steps starting 2016-06-01 00:00."""
    first_minute = pd.Timestamp(year=2016, month=6, day=1, hour=0, minute=0)
    return pd.date_range(
        start=first_minute, periods=43200, freq='1min', tz='UTC')
|
||
|
||
def test_format_index(expected_index):
    """Check that the parsed index equals the ``expected_index`` fixture.

    The fixture is requested as a test argument so that pytest injects its
    value; pytest raises an error when a fixture function is called directly.
    """
    actual = bsrn.read_bsrn(testfile)
    assert actual.index.equals(expected_index)
AdamRJensen marked this conversation as resolved.
Show resolved
Hide resolved
|
Uh oh!
There was an error while loading. Please reload this page.