Article Figure 1 Figure 2 Figure 3 Figure 4
Figure 5 Listing 1 Listing 2 Listing 3 Listing 4 aug2004.tar

Listing 4 compare_counts.pl: Finding missing records from remote tables

#!/usr/bin/perl -w
# compare_counts
# (C) Giuseppe Maxia 2003
# Compare two tables in two remote databases, looking for missing records.
# If the global count is different, compares the tables group by group
# and returns the missing records.
# It does not find records that are missing in the same number, in both 
# tables and in the same group. For such cases, the find_differences script 
# must be used.
use strict;
use DBI;

# This script is for educational purposes only.
# Parameters are embedded in the script itself.
# For production, it woyld be better to use either
# a config file or any Getopt::XX modules
my $host1 = "localhost";
my $host2 = "192.168.59.9";

my $db1 = "xcompany";
my $db2 = "xcompany";
my $tablename = "employees";

# primary key of the table
my $PK = "employee_code";

# Grouping column, with eventually a formula to obtain it
# (Used in the field list of a SELECT statement)
my $grouping_col = qq{FLOOR(employee_code / 1204)};
# alias for grouping column, to be used in the GROUP BY clause
my $grouping_alias = "empcode";

my ( $user1, $password1 ) = ( 'myuser', 'secret' );
my ( $user2, $password2 ) = ( 'myuser', 'secret' );

my $dbh1 =
  DBI->connect( "DBI:mysql:$db1;" . "host=$host1", $user1, $password1,
    { RaiseError => 1, PrintError => 0 } )
  or die "can't connect : $DBI::errstr";

my $dbh2 =
  DBI->connect( "DBI:mysql:$db2;" . "host=$host2", $user2, $password2,
    { RaiseError => 1, PrintError => 0 } )
  or die "can't connect : $DBI::errstr";

unless ( grep { $_ eq $tablename } $dbh1->tables() ) {
    die "$tablename not found in $db1\n";
}

unless ( grep { $_ eq $tablename } $dbh2->tables() ) {
    die "$tablename not found in $db2\n";
}

compare_counts();

$dbh1->disconnect();
$dbh2->disconnect();

sub compare_counts {
    my $count_query = qq{SELECT COUNT(*) FROM $tablename};
    my ($count1) = @{ $dbh1->selectcol_arrayref($count_query) };
    my ($count2) = @{ $dbh2->selectcol_arrayref($count_query) };
    if ( $count2 == $count1 ) {
        print "Table $tablename in $db1 and in $db2 ",
            "have the same number of records\n";
        return;
    }
    my $get_counts_query = 
      qq{SELECT $grouping_col AS $grouping_alias, COUNT(*) FROM $tablename 
         GROUP BY $grouping_alias };
    my $get_PK_query = qq{SELECT $PK FROM $tablename WHERE $grouping_col = ? };

    # gets the counts for each range
    my $set1         = get_counts( $dbh1, $get_counts_query );
    my $set2         = get_counts( $dbh2, $get_counts_query );
    my %differences  = ();

    # finds the groups in server2 missing from server 1
    for ( keys %$set2 ) {
        if ( not exists $set1->{$_} ) {
            my $missing_columns =
              get_columns( $dbh2, $get_PK_query, $set2->{$_}->[0] );
            print "missing from $db1 (whole group $set2->{$_}->[0] ):",
              " @$missing_columns\n";
        }
    }
    # finds the groups in server1 missing from server2
    for ( keys %$set1 ) {
        my ( $dbh, $db, $columns ) = ( undef, undef, undef );
        if ( not exists $set2->{$_} ) {
            my $missing_columns =
              get_columns( $dbh1, $get_PK_query, $set1->{$_}->[0] );
            print "missing from $db2 (whole group $set1->{$_}->[0] ):",
              " @$missing_columns\n";
        }
        # establishes which set is smaller and gets the primary key from
        # the lesser one.
        elsif ( $set1->{$_}[1] > $set2->{$_}[1] ) {
            $columns = get_columns( $dbh2, $get_PK_query, $set2->{$_}->[0] );
            $db      = $db1;
            $dbh     = $dbh1;
        }
        elsif ( $set2->{$_}[1] > $set1->{$_}[1] ) {
            $columns = get_columns( $dbh1, $get_PK_query, $set1->{$_}->[0] );
            $db      = $db2;
            $dbh     = $dbh2;
        }
        # if the count is the same for this group, do nothing
        else {
            next;
        }
        # gets the missing record from the server with the most 
        # records in this group by comparing with the list taken 
        # from the lesser server
        my $group                = $set1->{$_}->[0];
        my $get_missing_PK_query =
            qq{SELECT $PK FROM $tablename WHERE $grouping_col = $group 
            AND $PK NOT IN (}
          . join ( ",", map { $dbh->quote($_) } @$columns ) . ")";
        my $missing_cols = $dbh->selectcol_arrayref($get_missing_PK_query);
        print "missing from ", ( ( $db eq $db1 ) ? $db2 : $db1 ),
          " (missing lines): @$missing_cols\n";
    }
}

# returns the first column of one query with a specific parameter
sub get_columns {
    my ( $dbh, $query, $param ) = @_;
    my $sth = $dbh->prepare($query);
    $sth->execute($param);
    my @columns = map { $_->[0] } @{ $sth->fetchall_arrayref() };
    return \@columns;
}

# returns the set of counts converted into a hash
sub get_counts {
    my ( $dbh, $query ) = @_;
    my $sth = $dbh->prepare($query);
    $sth->execute();
    my %set = ();
    while ( my $row = $sth->fetchrow_arrayref() ) {
        $set{ $row->[0] } = [ $row->[0], $row->[1] ];
    }
    return \%set;
}