From 85b963fe50e9cec802aa7e89c9e56b9dfe37eb0b Mon Sep 17 00:00:00 2001
From: Sophie Hourihane <sophie.hourihane@ligo.org>
Date: Wed, 19 Jun 2024 11:10:06 -0700
Subject: [PATCH] Adding --thin-by as an argument to BayesWaveCleanFrame

---
 src/BayesWaveCleanFrame.c | 171 +++++++++++++++++++++++---------------
 1 file changed, 104 insertions(+), 67 deletions(-)

diff --git a/src/BayesWaveCleanFrame.c b/src/BayesWaveCleanFrame.c
index d8d2b97..f8060b5 100644
--- a/src/BayesWaveCleanFrame.c
+++ b/src/BayesWaveCleanFrame.c
@@ -81,6 +81,7 @@ struct CleanFrameData
 
   int fairdraw_index;    //index of fair draw (note, this is 0-indexed.
                          // The point must be from the second half of the chain file)
+  int thin_by;           // factor by which to thin chains (default 1)
   char bw_model[MAXSTRINGSIZE]; //bayeswave glitch model
 
   double bw_seglen;   //bayeswave segment length
@@ -107,6 +108,49 @@ void printProgress (double percentage)
 }
 
 
+int getNumberOfLinesInFile(FILE *paramsfile) {
+  // First, count the number of lines in the file.
+  int line_count = 0;
+  char ch;
+
+  // Save the current position of the file pointer.
+  long initial_pos = ftell(paramsfile);
+
+  // Count lines by reading character by character
+  while ((ch = fgetc(paramsfile)) != EOF) {
+    if (ch == '\n') {
+      line_count++;
+    }
+  }
+  // Reset the file pointer to the beginning of the file.
+  fseek(paramsfile, initial_pos, SEEK_SET);
+  return line_count;
+}
+
+void jumpNLinesInFile(FILE *paramsfile, int line_number) {
+    if (line_number < 0) {
+      fprintf(stderr, "ERROR: Line number cannot be negative.\n");
+      exit(1);
+    }
+
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t read;
+    int current_line = 0;
+    //fprintf(stdout, "Jumping %i lines in file.\n", line_number);
+    while (current_line < line_number) {
+      read = getline(&line, &len, paramsfile);
+      if (read == -1) {
+        fprintf(stderr, "ERROR: Could not jump to line %d in file. Reached EOF.\n", line_number);
+        free(line);  // Free the allocated buffer before exiting
+        exit(1);
+      }
+      current_line++;
+    }
+    free(line);  // Free the allocated buffer after use
+}
+
+
 /* ============================  MAIN PROGRAM  ============================ */
 
 
@@ -135,7 +179,7 @@ int main(int argc, char *argv[]) {
   fprintf(stdout, "\n");
 
   /*   Variable declaration   */
-  int i, j, k, NGWF;
+  int j, k, NGWF;
 
   double x;
   double dT;
@@ -222,72 +266,58 @@ int main(int argc, char *argv[]) {
 
   FILE *paramsfile = fopen(data->bw_model, "r");
 
+  int N_chain_samples = getNumberOfLinesInFile(paramsfile);
+  fprintf(stdout, "Number of lines in file: %i\n", N_chain_samples);
+
   int glitch_model_size;
   double *glitch_model_params = malloc(NP * sizeof(double));
 
-  //count lines in file
-  int N_chain_samples = 0;
-  while(!feof(paramsfile))
-  {
-    fscanf(paramsfile, "%i", &glitch_model_size);
-    for(int d=1; d<=glitch_model_size; d++)
-    {
-      for(i=0; i<NP; i++) fscanf(paramsfile,"%lg", &glitch_model_params[i]);
-    }
-    N_chain_samples++;
-  }
-  rewind(paramsfile);
-  N_chain_samples--;
-
-
-  //get rid of burnin samples
+  //get rid of burnin samples by placing file pointer halfway down
   int N_burnin = N_chain_samples/2;
-  for(int n=0; n<N_burnin; n++)
-  {
-    fscanf(paramsfile, "%i", &glitch_model_size);
-    if(glitch_model_size>0)
-    {
-      for(int d=1; d<=glitch_model_size; d++)
-      {
-        for(i=0; i<5; i++) fscanf(paramsfile,"%lg", &glitch_model_params[i]);
-      }
-    }
+  int N_after_burnin = N_chain_samples - N_burnin;
+  int NWaveforms = 1;
+  if (data->medianFlag){
+    NWaveforms = N_after_burnin / data->thin_by;
   }
 
   //setup array to store post-burnin wavelets
-  for(i=0; i<NBW; i++)
+  for(int i=0; i<NBW; i++)
   {
-    glitch_model_posterior[i] = malloc(sizeof(double)*N_burnin);
-    for(int n=0; n<N_burnin; n++) glitch_model_posterior[i][n] = 0.0;
+    glitch_model_posterior[i] = malloc(sizeof(double) * NWaveforms);
+    for(int n=0; n<NWaveforms; n++) glitch_model_posterior[i][n] = 0.0;
   }
 
   char filename[128];
   FILE *tempfile;
 
-  // when running with fairdraw, we need to pick a sample at random
-  //pick a sample at "random" e.g. final line
-  int fair_draw = N_burnin - 1;
-  if (data->fairdraw_index >= 0)
+  // set fairdraw index to be last sample if not specified
+  int fairdraw_index = data->fairdraw_index < 0 ? N_chain_samples - 1 : data->fairdraw_index;
+
+  int index = 0;
+
+  if (data->medianFlag)
   {
-    if ((data->fairdraw_index - N_burnin) < 0)
-    {
-      fprintf(stderr, "ERROR: fairdraw index %i must be in second half of samples (there are %i samples)\n", data->fairdraw_index, N_chain_samples);
-      exit(1);
-    }
-    fair_draw = data->fairdraw_index - N_burnin;
+    index = N_burnin;
+    jumpNLinesInFile(paramsfile, N_burnin);
+  }
+  else {
+    index = fairdraw_index;
+    jumpNLinesInFile(paramsfile, fairdraw_index);
   }
 
   //parse posterior samples and generate waveforms
   printf("Generating glitch model posterior:\n");
-  for(int n=0; n<N_burnin; n++)
+  for(int n=0; n<NWaveforms; n++)
   {
-    for(i=0; i<NBW; i++) glitch_model[i] = 0.0;
+    for(int i=0; i<NBW; i++) glitch_model[i] = 0.0;
     
     fscanf(paramsfile, "%i", &glitch_model_size);
 
-    if (n == fair_draw && !(data->medianFlag)){
+    //fprintf(stdout, "Index is %i glitch model size: %i params:", index + n * data->thin_by, glitch_model_size);
+
+    if (n == 0 && !(data->medianFlag)){
       if (glitch_model_size == 0){
-        fprintf(stderr, "WARNING! Fair draw requested at index %i, but the glitch is 0 dimensional there\n", fair_draw + N_burnin);
+        fprintf(stderr, "WARNING! Fair draw requested at index %i, but the glitch is 0 dimensional there\n", fairdraw_index);
       }
     }
     
@@ -295,18 +325,14 @@ int main(int argc, char *argv[]) {
     {
       for(int d=1; d<=glitch_model_size; d++)
       {
-        for(i=0; i<NP; i++) fscanf(paramsfile,"%lg", &glitch_model_params[i]);
-        if ((data->medianFlag) || (n == fair_draw))
-        {
+        for(int i=0; i<NP; i++) fscanf(paramsfile,"%lg", &glitch_model_params[i]);
+          //fprintf(stdout, "t: %f f: %f Q: %f A: %e phi: %f ", glitch_model_params[0], glitch_model_params[1], glitch_model_params[2], glitch_model_params[3], glitch_model_params[4]);
           SineGaussianTime(glitch_model, glitch_model_params, NBW, 1, data->bw_seglen);
-        }
       }
-
-      //copy glitch_model to master glitch array for safe keeping
-      for(i=0; i<NBW; i++) glitch_model_posterior[i][n] = glitch_model[i];
+      for(int i=0; i<NBW; i++) glitch_model_posterior[i][n] = glitch_model[i];
     }
-    
-    printProgress((double)(n+1)/(double)N_burnin);
+    printProgress((double)(n+1)/(double)NWaveforms);
+    jumpNLinesInFile(paramsfile, data->thin_by);
   }
   printf("\n");
   
@@ -314,16 +340,16 @@ int main(int argc, char *argv[]) {
   //now, reuse glitch_model to store the reconstruction for subtraction
   if(data->medianFlag)
   {
-    for(i=0; i<NBW; i++)
+    for(int i=0; i<NBW; i++)
     {
-      gsl_sort(glitch_model_posterior[i], 1, N_burnin);
-      glitch_model[i] = gsl_stats_median_from_sorted_data(glitch_model_posterior[i], 1, N_burnin);
+      gsl_sort(glitch_model_posterior[i], 1, NWaveforms);
+      glitch_model[i] = gsl_stats_median_from_sorted_data(glitch_model_posterior[i], 1, NWaveforms);
     }
   }
   else
   {
-    for(i=0; i<NBW; i++)
-      glitch_model[i] = glitch_model_posterior[i][fair_draw];
+    for(int i=0; i<NBW; i++)
+      glitch_model[i] = glitch_model_posterior[i][0];
   }
   
   /*
@@ -337,10 +363,10 @@ int main(int argc, char *argv[]) {
   k = j+NBW;
 
   //zero pad array
-  for(i=0; i<NGWF; i++) glitch_model_padded[i] = 0.0;
+  for(int i=0; i<NGWF; i++) glitch_model_padded[i] = 0.0;
 
   //place glitch model in right part of full array
-  for(i=j; i< k; i++)
+  for(int i=j; i< k; i++)
   {
     if(i>=0 && i<NGWF && (i-j)>=0 && (i-j)<NBW)
       glitch_model_padded[i] = glitch_model[i-j];
@@ -354,7 +380,7 @@ int main(int argc, char *argv[]) {
 
 
   // Initialise and populate clean data
-  for(i=0; i<NGWF; i++)
+  for(int i=0; i<NGWF; i++)
   {
     //fill LAL data structure with cleaned data for frame-making
     timeRes->data->data[i] = timeData->data->data[i] - glitch_model_padded[i];
@@ -367,7 +393,7 @@ int main(int argc, char *argv[]) {
   if(data->verboseFlag)
   {
     vFile = fopen("ingredients.dat","w");
-    for(i=j; i<k; i++)
+    for(int i=j; i<k; i++)
     {
       fprintf(vFile,"%lg %lg %lg %lg\n",(double)(i-j)*dT,timeData->data->data[i],timeGlitch->data->data[i],timeRes->data->data[i]);
     }
@@ -454,8 +480,6 @@ static void output_frame(REAL8TimeSeries *timeData,
 
 void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
 {
-  int i;
-
   if(argc==1)
   {
     print_usage();
@@ -463,13 +487,14 @@ void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
   }
 
   FILE *outfile = fopen("bayeswave_cleanframe.run","w");
-  for(i=0; i<argc; i++) fprintf(outfile,"%s ",argv[i]);
+  for(int i=0; i<argc; i++) fprintf(outfile,"%s ",argv[i]);
   fprintf(outfile,"\n");
   fclose(outfile);
 
   data->medianFlag  = 0;
   data->verboseFlag = 0;
   data->fairdraw_index = -1; // default to -1 meaning we take the last sample
+  data->thin_by = 1;
 
   static struct option long_options[] =
   {
@@ -486,6 +511,7 @@ void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
     {"clean-suffix",  required_argument, 0, 0},
     {"outdir",        required_argument, 0, 0},
     {"fairdraw-index",required_argument, 0, 0},
+    {"thin-by",       required_argument, 0, 0},
     {"median",        no_argument, 0, 0},
     {"verbose",       no_argument, 0, 0},
     {"help",          no_argument, 0,'h'},
@@ -497,7 +523,7 @@ void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
 
   int argCount = 0;
   int argcheck[REQARG];
-  for(i=0; i<REQARG; i++) argcheck[i] = 0;
+  for(int i=0; i<REQARG; i++) argcheck[i] = 0;
 
   //Loop through argv string and pluck out arguments
   while ((opt = getopt_long_only(argc, argv,"apl:b:", long_options, &long_index )) != -1)
@@ -596,6 +622,16 @@ void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
             exit(1);
           }
         }
+        if(strcmp("thin-by",long_options[long_index].name) == 0)
+        {
+          data->thin_by = atoi(optarg);
+          fprintf(stdout, "setting thin-by to %i\n", data->thin_by);
+          if (data->thin_by < 1)
+          {
+            fprintf(stderr, "ERROR: When supplied, thin_by must be a positive integer <= 1, given instead %i\n", data->thin_by);
+            exit(1);
+          }
+        }
         if(strcmp("help", long_options[long_index].name) == 0)
         {
           fprintf(stdout,"BayesWaveCleanFrame:\n");
@@ -618,7 +654,7 @@ void parse_command_line_args(int argc, char **argv, struct CleanFrameData *data)
   }
 
   //make sure there all the right arguments were used
-  for(i=0; i<REQARG; i++)
+  for(int i=0; i<REQARG; i++)
   {
     if(argcheck[i]!=1)
     {
@@ -669,6 +705,7 @@ void print_usage()
   fprintf(stdout,"--verbose       print intermediate data products to disk\n");
   fprintf(stdout,"--median        use median glitch reconstruction\n");
   fprintf(stdout,"--fairdraw-index Index of fairdraw in glitch-model file. Note that the index must be in the second half of the file\n");
+  fprintf(stdout,"--thin-by       When calculating median, only takes median from every thin-by samples (default 1)\n");
   fprintf(stdout,"                can only be used on a completed run\n");
   fprintf(stdout,"--help          print help output\n");
   fprintf(stdout,"\n");
-- 
GitLab