Sunday, January 22, 2012

CSV files merge, based on series names...

This example illustrates how the janalyse-series API can be used to merge CSV files together while respecting series names. It takes a set of directories containing CSV files, merges all series and writes the results to a new directory. The merge is based only on series names, not file names, so if different file names were used for one series, only one of them will be chosen in the final destination directory: the shortest one.
#!/bin/sh
exec jaseries -deprecation -savecompiled "$0" "$@"
!#
/*
 * Copyright 2011 David Crosson
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import fr.janalyse.series._
import java.io.File

// Command-line guard : at least one source directory plus the destination
// directory are needed, otherwise the usage text is printed and the script stops.
// NOTE(review): the exit status is 0 even on this usage path - confirm this is intended.
if (args.length < 2) {
  List(
    "usage : csv-merge srcDir1 ... srcDirN destNewDir",
    "  Merge and reduce numeric series stored in CSV files"
  ).foreach(line => println(line))
  System.exit(0)
}

/** One series as read from an input CSV file.
  *
  * @param name         name of the series
  * @param filebasename base name associated with the file the series was read from
  * @param series       the series content
  */
case class InputSeries(
  name: String,
  filebasename: String,
  series: Series[Cell]
)
// Small java.io.File factories, overloaded on how the location is given.

/** File designated by a plain path. */
def file(filename: String): File = new File(filename)

/** File named `filename` inside the directory whose path is `dirname`. */
def file(dirname: String, filename: String): File = new File(dirname, filename)

/** File named `filename` inside the directory `dir`. */
def file(dir: File, filename: String): File = new File(dir, filename)

/** True when the file name ends with one of the accepted extensions,
  * ".csv" or ".jxtcsv" (the comparison is case sensitive).
  */
def csvFileFilter(f:String):Boolean =
  List(".csv", ".jxtcsv").exists(extension => f.endsWith(extension))
// Regexes are built with the regular ".r" method call : the postfix form
// ("..."r) is deprecated and needs scala.language.postfixOps on newer compilers.

// Name carrying a "-<digits>" or "_<digits>" suffix right before an extension,
// e.g. "cpu-1.csv" ; the group captures what precedes that suffix.
val basename1RE = """(.*)(?:[-_]\d+)[.].+""".r
// Any name with an extension ; the (greedy) group captures what precedes it.
val basename2RE = """(.*)[.].+""".r

/** Base name of a file name : the extension is dropped, as well as a trailing
  * "-<digits>" / "_<digits>" suffix when there is one.
  *
  * @param filename file name to reduce
  * @return the captured base name, or `filename` unchanged when it has no extension
  */
def basename(filename:String): String = filename match {
  case basename1RE(stem) => stem   // numeric suffix + extension removed
  case basename2RE(stem) => stem   // extension only removed
  case other => other
}

// Every argument but the last one is a source directory, the last one is the
// destination directory.
val toMerge = args.init
val destNewDir = file(args.last)

// mkdirs() returns false on failure but also when the directory already exists,
// so only give up when no usable directory is there afterwards. Failing here
// avoids reading every input before finding out nothing can be written.
if (!destNewDir.mkdirs() && !destNewDir.isDirectory) {
  System.err.println("Can't create destination directory : " + destNewDir)
  System.exit(1)
}

// --- Read everything

/** Names of the CSV files (see csvFileFilter) found directly inside `dirname`.
  * Fails with an explicit message when the directory can not be listed.
  */
def csvFileNames(dirname:String):List[String] =
  // java.io.File.list returns null when the path is not a directory or can not be read
  Option(file(dirname).list) match {
    case Some(names) => names.toList filter csvFileFilter
    case None        => sys.error("Can't list source directory : " + dirname)
  }

// Every series of every CSV file of every source directory, each one tagged
// with the base name of the file it comes from.
val inputSeriesList:List[InputSeries] = {
  // Local accumulator ; prepending means the last series read comes first.
  var collected = List.empty[InputSeries]
  for( dirname<-toMerge ;
       filename<-csvFileNames(dirname) ;
       (seriesname, series)<-CSV2Series.fromFile(file(dirname, filename))) {
    collected = InputSeries(seriesname, basename(filename), series)::collected
  }
  collected
}

// --- Merge and Reduce
// One merged series per series name, paired with the file base name it will
// be written under.
val seriesGroupByName=inputSeriesList groupBy {is => is.name}
// .toList first : a for/yield of pairs made straight over a Map builds another
// Map, here one keyed by the merged series, where a plain list of pairs is meant.
val mergedSeriesList = for( (seriesname, inputSeriesList4Name) <- seriesGroupByName.toList) yield {
  // All the series sharing this name are combined, left to right, with <<<
  val mergedseries   = inputSeriesList4Name.map(_.series) reduceLeft {_ <<< _}
  // The shortest base name is kept (a plain min would take the alphabetically
  // first one instead) ; equal lengths fall back to alphabetical order so that
  // the choice stays deterministic.
  val mergedbasename = inputSeriesList4Name.map(_.filebasename) minBy {n => (n.length, n)}
  (mergedseries, mergedbasename)
}

// --- Write everything
// Merged series that share a base name are written together, one
// "<basename>.csv" file per base name, inside the destination directory.
val mergedSeriesGroupByBasename = mergedSeriesList groupBy {_._2}
mergedSeriesGroupByBasename foreach { case (mergedbasename, mergedTuples) =>
  val seriesList = mergedTuples map {_._1}
  val target     = file(destNewDir, mergedbasename+".csv")
  CSV2Series.toFile(seriesList, target)
}

No comments:

Post a Comment