Information Technology Grimoire

Version .0.0.1

IT Notes from various projects because I forget, and hopefully they help you too.

Perl Script to Splice CSV Files

If you happen to have a huge data set CSV and want to split them out into a smaller subset you can use this Perl script to splice CSV files.

You can only splice text files, and the column orders need to be the same, but it’s useful when you want to run a test on some sample code instead of running a test on millions of lines of CSV data.

If you want Custom Perl Scripting please contact us so we can give you a quote on your project.

Here is how to use a Perl script to splice CSV files.

#!/usr/bin/perl

use warnings;               # standard best practices
use strict;                 # standard best practices
use Benchmark;              # in case we need to optimize
use Getopt::Std;            # for CLI options

#-----------------------------------------------------------
# SCRIPT START - Do not touch below
#-----------------------------------------------------------
$|++;
my @keeps;
my $lines;
my $input;
my $begin;
my $end;
my $header;
my %options;

#-----------------------------------------------------------
# LISTFILES: lists files in current directory
#-----------------------------------------------------------
sub listfiles {
    print "\nFiles We Found:\n";

    opendir my $dir, "." or die "Cannot open directory: $!";

    while ( my $node = readdir($dir) ) {
        next if $node =~ /^\./;
        print "$node\n";
    }

    closedir $dir;
}

#-----------------------------------------------------------
# USAGE: Educates user on how to run program, then lists files
#-----------------------------------------------------------

sub usage {
    my $msg = shift;
    print "$msg" . "\n\n";
    print "Usage: $0" . ' -i <somefile.csv> -b <begin> -e <end> -h <0|1>' . "\n\n";

    print "-i should be a csv text file\n";
    print "-b is the line number you want to begin with\n";
    print "-e is the line number you want to end with\n";
    print "-h is a boolean 0 or 1, do you want to include header\n\n";
    listfiles();
    exit;
}

#-----------------------------------------------------------
# Begin error checking and input
#-----------------------------------------------------------
# start timer
my $t0 = new Benchmark;
#--------------------------------
# Make sure at least one argument provided
if ( !@ARGV ) { usage(); }

# What are the options they entered
getopt("ibeh", \%options);

# check start line
if (defined $options{b}) {
    $begin = $options{b};
    unless ($begin >= 0) {
        usage("Not designed to have negative beginning")
    }
} else {
    usage();
}

# check stop line
if (defined $options{e}) {
    $end = $options{e};
    unless ($begin < $end) {
        usage("Cannot start after your file ends!")
    }
} else {
    usage();
}

# do we want a header (line 0)?
if (defined $options{h}) {
    $header = $options{h};
} else {
    usage("Do you want to use first line as a header?  0 or 1");
}

print qq~Verifying Infile......~;
# test for the existence of the a text file
if (defined $options{i}) {
    $input = $options{i};
    if (-e $input) {
        if (-B $input) {
            usage("Input file is binary, should be text");
        }
    } else {
        usage("Input File Doesn't Exist")
    }
} else { usage(); }

my $outfile    = "$input" . "-processed.csv";     # build outfile names
#-----------------------------------------------------------
# Main Logic
#-----------------------------------------------------------

# tell them what's going on
my $t1 = new Benchmark; my $td = timediff($t1,$t0);
print "DONE!!  took '",timestr($td), "' seconds\n";
#--------------------------------
print qq~Counting Lines........~;

#-----------------------------------------------------------
# READ ENTIRE FILE, LINE BY LINE, FROM BEGIN to END
#-----------------------------------------------------------
# open file handle ($fh) for reading (<:)
open my $fh, "<:encoding(utf8)", $input or die "$!\n";
while (my $row = <$fh>) {
    $lines++;           # keep track of how many lines total
    chomp $row;         # strip off new line

    # capture header if line == 1
    if (($lines == 1) && ($header)){
        push @keeps, $row;
    } elsif ($lines == $begin) {
        push @keeps, $row;
    } elsif (($lines > $begin) && ($lines <= $end)){
        push @keeps, $row;
    } elsif ($lines > $end) { last; } else { next; }
 }
close $fh;

# notify user of what's going on
my $t2 = new Benchmark; $td = timediff($t2,$t1);
print "DONE!!  took '",timestr($td), "' seconds\n";
#--------------------------------
print qq~Create Output File....~;

open my $fh2, ">:encoding(utf8)", $outfile or die "$!\n";

    foreach my $line (@keeps) {
        print $fh2 "$line\n";
    }

close $fh2;

# notify user of what's going on
my $t3 = new Benchmark; $td = timediff($t3,$t2);
print "DONE!!  took '",timestr($td), "' seconds\n";
#--------------------------------
print "Counted '$lines' lines of data\n";

my $kept = @keeps;
print "KEPT: $kept\n";
Last updated on 10 Oct 2018
Published on 10 Oct 2018