Merge pull request #4 from MSKirk/wip_raphael

MSKirk · web-flow · commit 14e19244441a · 2019-10-07T11:01:17.000-04:00
Added in changes from @WaaallEEE
diff --git a/README.txt b/README.txt
@@ -1,11 +1,15 @@
 To handle the overlapping events and associated duplicated files accross consecutive months,
 you will find:
-(1) an aggregated map of all overlap-free events in a csv file. This only shows file basenames
-(2) csv files for mapping the data relative paths into a single parent directory.
-All these maps get rid of the overlapping events and duplicated files by keeping only one
-wherever they occur.
 
-The aggregation script to produce these csv files is available
+(1) an aggregated map of all duplicate-free events in a csv file. This only shows file basenames:
+label_jp2_map_global.csv
+
+(2) csv files for mapping, copying or moving duplicate-free data relative paths into a single parent directory:
+- map_non_duplicated_jp2_paths.csv
+- map_non_duplicated_labels_paths.csv
+
+
+The aggregation script to produce these csv files using the data on the original disk is available
 at https://github.com/MSKirk/MachineLearning/blob/master/script_aggregation.py
 
 
diff --git a/calibration.py b/calibration.py
@@ -16,11 +16,12 @@ def scale_rotate(image, angle=0, scale_factor=1, reference_pixel=None):
     """
     Perform scaled rotation with opencv. About 20 times faster than with Sunpy & scikit/skimage warp methods.
     The output is a padded image that holds the entire rescaled,rotated image, recentered around the reference pixel.
-    Positive-angle rotation will go counterclockwise if the array is displayed with the origin on top (default),
-    and clockwise with the origin at bottom.
+    Positive-angle rotation rotates the image clockwise if the array origin (0,0) maps to the bottom left of the image,
+    and counterclockwise if the array origin maps to the top left of the image.
 
     :param image: Numpy 2D array
-    :param angle: rotation angle in degrees. Positive angle  will rotate counterclocwise if array origin on top-left
+    :param angle: rotation angle in degrees. Positive-angle rotation rotates the image clockwise if the array origin (0,0)
+    maps to the bottom left of the image, and counterclockwise if the array origin maps to the top left of the image.
     :param scale_factor: ratio of the wavelength-dependent pixel scale over the target scale of 0.6 arcsec
     :param reference_pixel: tuple of (x, y) coordinate. Given as (x, y) = (col, row) and not (row, col).
     :return: padded scaled and rotated image
diff --git a/images/2011_06_25__00_59_43_71__SDO_AIA_AIA_1700.jp2 b/images/2011_06_25__00_59_43_71__SDO_AIA_AIA_1700.jp2
diff --git a/read_jp2.py b/read_jp2.py
@@ -23,18 +23,19 @@ def read_solar_jp2(filepath, verbose=False):
     img = sunpy.io.read_file(filepath, filetype='jp2')[0]
     prepped_header = img.header
 
-    # Rotation of image to Solar North
+    # Rotate the image so that its vertical y-axis (top to bottom) is parallel to the solar north-to-south axis.
     if img.header['CROTA2'] != 0:
         if verbose:
             print('Rotating image to solar north')
         prepped_data = calibration.scale_rotate(img.data, img.header['CROTA2'])
+        prepped_header['CROTA2'] = 0
 
         center = ((np.array(prepped_data.shape) - 1) / 2.0).astype(int)
         half_size = int(aia_image_size / 2)
-        prepped_data = prepped_data[center[1] - half_size:center[1] + half_size, center[0] - half_size:center[0] + half_size]
-        prepped_header['CROTA2'] = 0
+        prepped_data = prepped_data[center[1] - half_size:center[1] + half_size, center[0] - half_size:center[0] + half_size].astype(np.float64)
+
     else:
-        prepped_data = img.data
+        prepped_data = img.data.astype(np.float64)
 
     # Normalizing the image intensity to levels at the start of the mission for AIA
     if 'AIA' in img.header['INSTRUME']:
diff --git a/sanity_checks/check_read_solar_jp2.py b/sanity_checks/check_read_solar_jp2.py
@@ -0,0 +1,35 @@
+import os
+from read_jp2 import read_solar_jp2
+import matplotlib
+matplotlib.use('Tkagg')
+import matplotlib.pyplot as plt
+import numpy as np
+import calibration
+
+# Get a jp2 sample. Here we use the jp2 included in this GitHub repo
+filepath = '../images/2011_06_25__00_59_43_71__SDO_AIA_AIA_1700.jp2'
+
+pdata, pheader = read_solar_jp2(filepath)
+pdata[0:1000, :] = 0
+
+rdata = calibration.scale_rotate(pdata, 45)
+
+# Display the image and make sure it's in the correct orientation with respect to the png sample
+vmax = np.percentile(pdata, 99.5)
+
+fs = 20
+plt.figure(0, figsize=(18, 18))
+plt.subplot(2, 2, 1)
+plt.imshow(pdata, vmin=pdata.min(), vmax=vmax, origin='lower', cmap='gray')
+plt.title('origin lower, no rotation', fontsize=fs)
+plt.subplot(2, 2, 2)
+plt.imshow(rdata, vmin=pdata.min(), vmax=vmax, origin='lower', cmap='gray')
+plt.title('origin lower, rotation argument +45 deg', fontsize=fs)
+plt.subplot(2, 2, 3)
+plt.imshow(pdata, vmin=pdata.min(), vmax=vmax, cmap='gray')
+plt.title('origin top, no rotation', fontsize=fs)
+plt.subplot(2, 2, 4)
+plt.imshow(rdata, vmin=pdata.min(), vmax=vmax, cmap='gray')
+plt.title('origin top, rotation argument +45 deg', fontsize=fs)
+plt.tight_layout()
+plt.show()
diff --git a/script_aggregation.py b/script_aggregation.py
@@ -5,11 +5,10 @@
 (1) Merges the content of the csv files found in all YEAR_MONTH subdirectories into a single global_csv_file.
 This file will map all jp2 to their labels without any overlaps. This csv only map file basenames.
 
-(2) This script also creates csv files mapping relative YEAR_MONTH-based file path to new common jp2 and labels directory
-directly under a new  parent directory. You may this e.g. for moving the files into a new more "global" tree.
+(2) This script also creates csv files mapping relative YEAR_MONTH-based file paths to new common jp2 and labels
+directories directly under a new parent directory. You may run this e.g. for moving the files into a new more "global" tree.
 
-E.g: given a parent directory "parent_dir" hosting the original data tree (parent_dir),
-the csv file will map everything in
+E.g: given a parent directory "parent_dir", the csv file will map things from
 
 parent_dir
     2010_12
@@ -47,26 +46,43 @@
 import glob
 import pandas as pd
 import csv
+from pathlib import Path
 
 
 #############   Set some data directories - update to your personal case  #############
 
 # Parent directory of all YEAR_MONTH subdirectories that will also contain the global csv file
-parent_dir = '/Volumes/SolarData/LabeledImages/'
+parent_dir = '/media/raphael/SolarData/V2'
 # Common directory where all files will be moved, without duplicates.
 parent_dir2 = parent_dir
 # Filename of csv file that will be the aggregation all csv files of all YEAR_MONTH subdirectories without duplicates
-global_csv_file = os.path.join(parent_dir, 'label_jp2_map_global.csv')
+global_csv_file = os.path.join(parent_dir2, 'label_jp2_map_global.csv')
+# csv output for empty data
+csv_empty_data = os.path.join(parent_dir2, 'empty_data.csv')
+
 
 ######### (1) Creating the aggregated map of jp2 and label masks ###########
 
 # Fetch the csv file paths recursively
 csv_files = sorted(glob.glob(os.path.join(parent_dir, '20*/label_jp2_map.csv')))
 # Read their content and concatenate in a unique dataframe
 dfs = []
+empty_csvs = []
 for csvf in csv_files:
     print(csvf)
-    dfs.append(pd.read_csv(csvf, header=None))
+    try:
+        # Sometimes the CSV file can be empty if no complete set exists at all
+        dfs.append(pd.read_csv(csvf, header=None))
+    except pd.errors.EmptyDataError:
+        print('Empty csv file')
+        # Write to file the parent directory of the empty data
+        empty_csvs.append([Path(csvf).parent.name])
+        continue
+
+with open(csv_empty_data, 'w') as csvFile:
+    writer = csv.writer(csvFile)
+    writer.writerows(empty_csvs)
+csvFile.close()
 
 # Concatenate the dataframes into a single one while dropping all duplicates
 label_jp2_map_global = pd.concat(dfs).drop_duplicates().reset_index(drop=True)
@@ -96,7 +112,7 @@
 jp2f_csv = os.path.join(parent_dir2, 'map_non_duplicated_jp2_paths.csv')
 labels_csv = os.path.join(parent_dir2, 'map_non_duplicated_labels_paths.csv')
 
-# Map the jp2 files
+# Map the jp2 files of each sub-directory into a single list
 new_files = []
 for file in jp2f:
     new_file = os.path.join(jp2_dir, os.path.basename(file))
@@ -108,7 +124,7 @@
         jp2f_list.append([original_file_relative, new_file_relative])
         new_files.append(new_file)
 
-# Write the csv file mapping the jp2 YEAR_MONTH-based path to new common directory
+# Write the csv file mapping the jp2 YEAR_MONTH-based path to a new single directory
 with open(jp2f_csv, 'w') as csvFile:
     writer = csv.writer(csvFile)
     writer.writerows(jp2f_list)
@@ -129,7 +145,7 @@
         labels_list.append([original_file_relative, new_file_relative])
         new_files.append(new_file)
 
-# Create the restore csv of .npz files (including png files) mapping the .npz and png YEAR_MONTH-based path to new common directory
+# Create the csv mapping the .npz and png YEAR_MONTH-based paths to the new common directory
 with open(labels_csv, 'w') as csvFile:
     writer = csv.writer(csvFile)
     writer.writerows(labels_list)
diff --git a/script_over_days.py b/script_over_days.py
@@ -4,8 +4,8 @@
 
 if __name__ == '__main__':
 
-    save_dir = os.path.abspath(os.path.expanduser('/Users/mskirk/Desktop/MLDataTest/test'))
-
+    save_dir = os.path.abspath(os.path.expanduser('~/Data/ML_projects/aia_recognition'))
+    # DO NOT DOWNLOAD ANYTHING BEFORE 2010/12 (no HMI data in helioviewer)
     tstart = '2011/06/25 00:00:00'
     tend = '2011/06/25 23:30:00'
     j = Jpd.Jp2ImageDownload(save_dir, tstart=tstart, tend=tend)
diff --git a/script_over_many_months.py b/script_over_many_months.py
@@ -11,6 +11,8 @@
     # SET THESE PARAMETERS:
     save_dir = os.path.abspath('/Users/mskirk/Desktop/MLDataTest')
     #save_dir = os.path.abspath('/Volumes/RAPH_1TB/Data/Michael/Hek_project')
+
+    # DO NOT DOWNLOAD ANYTHING BEFORE 2010/12 (no HMI data in helioviewer)
     start_date = '2017/01/01 00:00:00'  # inclusive
     end_date = '2017/06/01 00:00:00'  # not inclusive
 
diff --git a/script_over_many_months_raphael.py b/script_over_many_months_raphael.py
@@ -8,16 +8,20 @@
 
 if __name__ == '__main__':
 
-    # Update to your own case
-    save_dir = os.path.abspath('/Volumes/RAPH_1TB/Data/Michael/Hek_project')
-    start_date = '2018/05/01 00:00:00'  # inclusive
-    end_date = '2019/02/01 00:00:00'  # not inclusive
+    # SET THESE PARAMETERS:
+    save_dir = os.path.abspath('~/Data/ML_projects/HEK_project')
 
+    # DO NOT DOWNLOAD ANYTHING BEFORE 2010/12 (no HMI data in helioviewer)
+    start_date = '2017/01/01 00:00:00'  # inclusive
+    end_date = '2017/06/01 00:00:00'  # not inclusive
 
-    begin_list = [dt for dt in rrule(MONTHLY, dtstart=parse_time(start_date), until=parse_time(end_date))]
+    # SHOULDN'T NEED TO CHANGE ANYTHING BELOW THIS --------->>
+
+    begin_list = [dt for dt in rrule(MONTHLY, dtstart=parse_time(start_date).to_datetime(), until=parse_time(end_date).to_datetime())]
     end_list = [elem - datetime.timedelta(minutes=30) for elem in begin_list[1:]]
     del begin_list[-1]
 
+
     for tstart, tend in zip(begin_list, end_list):
 
         j = Jpd.Jp2ImageDownload(save_dir, tstart=tstart, tend=tend)
@@ -45,5 +49,3 @@
                 print('HEK server error during make_labels(). Trying again...')
                 logging.warning('HEK server error raised ConnectionResetError during make_labels()')
                 continue
-
-