=head1 NAME

Bio::JBrowse::HashStore - on-disk 2-level hash table

=head1 DESCRIPTION

Implements an on-disk hash table designed to be read as static files
over HTTP.  Entries are stored in a set of JSON-encoded files whose
paths are derived from a hash of the key.  For example:

  path/to/dir/29c/c14/fc.json
  path/to/dir/23f/5ad/11.json
  path/to/dir/711/7c1/29.json
  path/to/dir/5ec/b24/6a.json
  path/to/dir/4de/9ac/c6.json
  path/to/dir/41b/c43/27.json
  path/to/dir/28c/d86/e9.json

Each file contains a JSON object with entries like:

  { "foo": "bar", ... }

where "foo" is the original key and "bar" is the data stored under it.

=cut

package Bio::JBrowse::HashStore;
use strict;
use warnings;

use Carp;

use Storable ();
use JSON 2;

use File::Next ();
use File::Path ();
use File::Spec ();

use Digest::Crc32 ();
use DB_File ();
use IO::File ();
use POSIX ();

my $bucket_class = 'Bio::JBrowse::HashStore::Bucket';


=head2 open( dir => "/path/to/dir", hash_bits => 16, mem => 256 * 2**20 )
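
Open or create a hash store in the given directory.  The "dir" option
is required.  "hash_bits" must be a multiple of 4 and, along with
"compress" and "format", is persisted in the store's meta.json; a value
already recorded there takes priority over one passed to open().
"mem" is a rough memory budget in bytes used to size the in-memory
bucket cache.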

=cut

sub open {
    my $class = shift;

    # source of data: defaults, overridden by open args, overridden by meta.json contents
    my $self = bless {
        max_filehandles => 1000,
        mem => 256*2**20,
        @_
    }, $class;

    $self->{dir} or croak "dir option required";

    $self->empty if $self->{empty};

    $self->{meta} = $self->_read_meta;

    $self->{crc} = Digest::Crc32->new;

    # compress, format, and hash_bits all use the value in the
    # meta.json if present, or the value passed in by the constructor,
    # or the default, in order of priority
    my %defaults = ( compress => 0, format => 'json', hash_bits => 16 );
    for (qw( compress format hash_bits )) {
        $self->{$_} = $self->{meta}{$_} = (
            defined $self->{meta}{$_}  ?  $self->{meta}{$_}  :
            defined $self->{$_}        ?  $self->{$_}        :
                                          $defaults{$_}
        );
    }

    # check that hash_bits is a multiple of 4
    if( $self->{hash_bits} % 4 ) {
        die "Invalid hash bits value $self->{hash_bits}, must be a multiple of 4.\n";
    }

    $self->{hash_mask} = 2**($self->{hash_bits}) - 1;
    $self->{hash_sprintf_pattern} = '%0'.int( $self->{hash_bits}/4 ).'x';
    $self->{file_extension} = '.'.$self->{format};

    $self->{cache_size} = int( $self->{mem} / 50000 / 2 );
    print "Hash store cache size: $self->{cache_size} buckets\n" if $self->{verbose};

    File::Path::mkpath( $self->{dir} );

    return $self;
}

sub _make_cache {
    my ( $self, @args ) = @_;
    return Bio::JBrowse::HashStore::FIFOCache->new( @args );
}

# write out meta.json file when the store itself is destroyed
sub DESTROY {
    my ( $self ) = @_;
    File::Path::mkpath( $self->{dir} );
    {
        my $meta_path = $self->_meta_path;
        CORE::open my $out, '>', $meta_path or die "$! writing $meta_path";
        $out->print( JSON::to_json( $self->{meta} ) )
            or die "$! writing $meta_path";
    }

    # free everything to flush buckets
    %$self = ();
}
sub _meta_path {
    File::Spec->catfile( shift->{dir}, 'meta.json' );
}
sub _read_meta {
    my ( $self ) = @_;
    my $meta_path = $self->_meta_path;
    return {} unless -r $meta_path;
    CORE::open my $meta, '<', $meta_path or die "$! reading $meta_path";
    local $/;
    my $d = eval { JSON->new->relaxed->decode( scalar <$meta> ) } || {};
    warn $@ if $@;
    $d->{compress} = 0 unless defined $d->{compress};
    return $d;
}

=head2 meta

Return a hashref of metadata about this hash store.

=cut

sub meta {
    ( shift->{meta} ||= {} )
}

=head2 get( $key )
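
Fetch the value stored under $key, or undef if none is stored.

  my $data = $store->get( "foo" );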

=cut

sub get {
    my ( $self, $key ) = @_;

    my $bucket = $self->_getBucket( $key );
    return $bucket->{data}{$key};
}

=head2 stream_do( $arg_stream, $operation_callback )
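
Sort a stream of operations into per-bucket-group log files, then play
them back: for each operation, $operation_callback is called with the
operation and the value currently stored under that operation's key,
and whatever it returns is stored back.  Only element 0 of each
operation arrayref (the key) is interpreted by the store itself.  A
minimal sketch, assuming @ops holds [ $key, $new_value ] pairs:

  $store->stream_do(
      sub { shift @ops },            # next operation, undef when done
      sub {
          my ( $op, $current_value ) = @_;
          return $op->[1];           # value written back under $op->[0]
      },
      scalar @ops                    # estimated count, for the progress bar
  );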

=cut

sub stream_do {
    my ( $self, $op_stream, $do_operation, $estimated_op_count ) = @_;

    # clean up any stale log files
    { my $log_iterator = $self->_file_iterator( sub { /\.log$/ } );
      while( my $stale_logfile = $log_iterator->() ) {
          unlink $stale_logfile;
      }
    }

    # make log files for each bucket, log the operations that happen
    # on that bucket, but don't actually do them yet
    my $ops_written = 0;
    my $gzip = $self->{compress} ? ':gzip' : '';
    {
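        # Choose how many trailing hex characters of the bucket hash to
        # use for naming sort logs: enough that each log covers roughly
        # cache_size buckets, capped so that the number of distinct
        # logs (16**sort_log_chars) stays within max_filehandles.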
        my $hash_chars = $self->{hash_bits}/4;
        my $sort_log_chars = $hash_chars - int( log($self->{cache_size} )/log(16) );
        my $max_sort_log_chars = int( log( $self->{max_filehandles} )/log(16) );
        $sort_log_chars = 1 unless $sort_log_chars > 1;
        $sort_log_chars = $max_sort_log_chars unless $sort_log_chars <= $max_sort_log_chars;

        $hash_chars -= $sort_log_chars;
        my $zeroes = "0"x$hash_chars;

        print "Using $sort_log_chars chars for sort log names (".(16**$sort_log_chars)." sort logs)\n" if $self->{verbose};
        my $filehandle_cache = $self->_make_cache( size => $self->{max_filehandles} );
        my $progressbar = $estimated_op_count && $self->_make_progressbar( 'Sorting operations', $estimated_op_count );
        my $progressbar_next_update = 0;
        while ( my $op = $op_stream->() ) {
            my $hex = $self->_hex( $self->_hash( $op->[0] ) );

            substr( (my $log_hex = $hex), 0, $hash_chars, $zeroes );

            my $log_handle = $filehandle_cache->compute( $log_hex, sub {
                my ( $h ) = @_;
                my $pathinfo = $self->_hexToPath( $h );
                File::Path::mkpath( $pathinfo->{workdir} ) unless -d $pathinfo->{workdir};
                #warn "writing $pathinfo->{workpath}.log\n";
                CORE::open( my $f, ">>$gzip", "$pathinfo->{workpath}.log" )
                    or die "$! opening bucket log $pathinfo->{workpath}.log";
                return $f;
            });

            Storable::store_fd( [$hex,$op], $log_handle );

            $ops_written++;
            if ( $progressbar && $ops_written >= $progressbar_next_update && $ops_written < $estimated_op_count ) {
                $progressbar_next_update = $progressbar->update( $ops_written );
            }
        }
        if ( $progressbar && $ops_written > $progressbar_next_update ) {
            $progressbar->update( $estimated_op_count );
        }
    }

    # play back the operations, feeding the $do_operation sub with the
    # bucket and the operation to be done
    {
        my $progressbar = $ops_written && $self->_make_progressbar( 'Executing operations', $ops_written );
        my $progressbar_next_update = 0;
        my $ops_played_back = 0;
        my $log_iterator = $self->_file_iterator( sub { /\.log$/ } );
        while ( my $log_path = $log_iterator->() ) {
            CORE::open( my $log_fh, "<$gzip", $log_path ) or die "$! reading $log_path";
            #warn "reading $log_path\n";
            while ( my $rec = eval { Storable::fd_retrieve( $log_fh ) } ) {
                my ( $hex, $op ) = @$rec;
                my $bucket = $self->_getBucketFromHex( $hex );
                $bucket->{data}{$op->[0]} = $do_operation->( $op, $bucket->{data}{$op->[0]} );
                $bucket->{dirty} = 1;

                if ( $progressbar && ++$ops_played_back > $progressbar_next_update ) {
                    $progressbar_next_update = $progressbar->update( $ops_played_back );
                }
            }
            unlink $log_path;
        }

        if ( $progressbar && $ops_played_back > $progressbar_next_update ) {
            $progressbar->update( $ops_written );
        }
        }
    }
}

sub _file_iterator {
    my ( $self, $filter ) = @_;
    return File::Next::files( { file_filter => $filter }, $self->{work_dir}||$self->{dir} );
}

=head2 set( $key, $value )
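
Store $value under $key, marking the containing bucket dirty so it is
flushed to disk when evicted from the cache or when the store is
destroyed.  Returns the stored value.

  $store->set( "foo", { bar => 42 } );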

=cut

sub set {
    my ( $self, $key, $value ) = @_;

    my $bucket = $self->_getBucket( $key );
    $bucket->{data}{$key} = $value;
    $bucket->{dirty} = 1;
    $self->{meta}{last_changed_entry} = $key;

    return $value;
}

sub _make_progressbar {
    my ( $self, $description, $total_count ) = @_;

    return unless $self->{verbose};

    eval { require Term::ProgressBar };
    return if $@;

    my $progressbar = Term::ProgressBar->new({ name  => $description,
                                               count => $total_count,
                                               ETA   => 'linear'       });
    $progressbar->max_update_rate(1);
    return $progressbar;
}


=head2 empty

Clear the store of all contents.  Deletes all files and directories
from the store directory.  The work directory, if separately
specified, is preserved (see issue #563).

=cut

sub empty {
    my ( $self ) = @_;
    print "Removing existing contents of target dir $self->{dir}\n" if $self->{verbose};
    File::Path::rmtree( $self->{dir} );
    File::Path::mkpath( $self->{dir} );
    File::Path::mkpath( $self->{work_dir} ) if defined $self->{work_dir};
}


########## tied-hash support ########

sub TIEHASH {
    return shift->open( @_ );
}
sub FETCH {
    return shift->get(@_);
}
sub STORE {
    return shift->set(@_);
}
sub DELETE {
    die 'DELETE not implemented';
}
sub CLEAR {
    die 'CLEAR not implemented';
}
sub EXISTS {
    return !! shift->get(@_);
}
sub FIRSTKEY {
    die 'FIRSTKEY not implemented';
}
sub NEXTKEY {
    die 'NEXTKEY not implemented';
}
sub SCALAR {
    die 'SCALAR not implemented';
}
sub UNTIE {
    die 'UNTIE not implemented';
}
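
# Usage sketch for the tie interface: TIEHASH forwards its arguments
# to open(), FETCH to get(), and STORE to set(); the other hash
# operations (delete, each, keys, ...) are unsupported and die.
#
#   tie my %store, 'Bio::JBrowse::HashStore', dir => 'path/to/dir';
#   $store{foo} = { bar => 42 };   # STORE -> set
#   my $data = $store{foo};        # FETCH -> get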

########## helper methods ###########

# combined hash-and-hex-format of a key, with caching
sub _hexHash {
    my ( $self, $key ) = @_;
    my $cache = $self->{hex_hash_cache} ||= $self->_make_cache( size => 300 );
    return $cache->compute( $key, sub {
        my ($k) = @_;
        return $self->_hex( $self->_hash( $k ) );
    });
}

sub _hash {
    $_[0]->{crc}->strcrc32( $_[1] ) & $_[0]->{hash_mask}
}

sub _hex {
    sprintf( $_[0]->{hash_sprintf_pattern}, $_[1] );
}

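# split the hex hash into path segments of up to 3 characters: with
# 32-bit hashes, for example, hex "29cc14fc" maps to dir/29c/c14/fc.json
# (work paths are computed the same way under work_dir, when set)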
sub _hexToPath {
    my ( $self, $hex ) = @_;
    my @dir = ( $self->{dir}, $hex =~ /(.{1,3})/g );
    my @workdir = ( $self->{work_dir}||$self->{dir}, $hex =~ /(.{1,3})/g );
    my $file = (pop @dir).$self->{file_extension};
    my $workfile = (pop @workdir).$self->{file_extension};
    my $dir = File::Spec->catdir(@dir);
    my $workdir = File::Spec->catdir(@workdir);
    #warn "hex: $hex, fullpath: ".File::Spec->catfile( $dir, $file )."\n";
    return {
        dir      => $dir,
        fullpath => File::Spec->catfile( $dir, $file ),
        workdir  => $workdir,
        workpath => File::Spec->catfile( $workdir, $workfile ),
    };
}

sub _getBucket {
    my ( $self, $key ) = @_;
    return $self->_getBucketFromHex( $self->_hexHash( $key ) );
}

sub _flushAllCaches {
    my ( $self ) = @_;
    delete $self->{$_} for (
        qw(
              bucket_cache
              bucket_log_filehandle_cache
              hex_hash_cache
              bucket_path_cache_by_hex
          ));
}

sub _getBucketFromHex {
    my ( $self, $hex ) = @_;
    my $bucket_cache = $self->{bucket_cache} ||= $self->_make_cache( size => $self->{cache_size} );
    return $bucket_cache->compute( $hex, sub {
        return $self->_readBucket( $self->_getBucketPath( $hex ) )
    });
}

sub _getBucketPath {
    my ( $self, $hex ) = @_;
    my $path_cache = $self->{bucket_path_cache_by_hex} ||= $self->_make_cache( size => $self->{cache_size} );
    return $path_cache->compute( $hex, sub { $self->_hexToPath( $hex ) });
}

sub _readBucket {
    my ( $self, $pathinfo ) = @_;

    my $path = $pathinfo->{fullpath}.( $self->{compress} ? 'z' : '' );
    my $dir = $pathinfo->{dir};
    my $gzip = $self->{compress} ? ':gzip' : '';

    return $bucket_class->new(
        format => $self->{format},
        compress => $self->{compress},
        dir => $dir,
        fullpath => $path,
        ( -f $path
            ? (
                data => eval {
                    if ( $self->{format} eq 'storable' ) {
                        Storable::retrieve( $path )
                    } else {
                        CORE::open my $in, "<$gzip", $path or die "$! reading $path";
                        local $/;
                        JSON::from_json( scalar <$in> )
                    }
                } || {}
              )
            : ( data => {}, dirty => 1 )
        ));
}


######## inner class for on-disk hash buckets ##########

package Bio::JBrowse::HashStore::Bucket;

sub new {
    my $class = shift;
    bless { @_ }, $class;
}

# when a bucket object is garbage-collected (e.g. evicted from the
# bucket cache), flush it to disk if it has unsaved changes
sub DESTROY {
    my ( $self ) = @_;

    if( $self->{dirty} && %{$self->{data}} ) {
        File::Path::mkpath( $self->{dir} ) unless -d $self->{dir};
        if( $self->{format} eq 'storable' ) {
            Storable::store( $self->{data}, $self->{fullpath} );
        } else {
            my $gzip = $self->{compress} ? ':gzip' : '';
            my $out = IO::File->new( $self->{fullpath}, ">$gzip" )
                or die "$! writing $self->{fullpath}";
            $out->print( JSON::to_json( $self->{data} ) ) or die "$! writing to $self->{fullpath}";
        }
    }
}


######## inner class for simple FIFO caching ##########
package Bio::JBrowse::HashStore::FIFOCache;

sub new {
    my $class = shift;
    return bless {
        fifo  => [],
        bykey => {},
        size  => 100,
        @_
    }, $class;
}

sub compute {
    my ( $self, $key, $callback ) = @_;
    return exists $self->{bykey}{$key} ? $self->{bykey}{$key} : do {
        my $fifo = $self->{fifo};
        if( @$fifo >= $self->{size} ) {
            delete $self->{bykey}{ shift @$fifo };
        }
        push @$fifo, $key;
        return $self->{bykey}{$key} = $callback->($key);
    };
}
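
# Usage sketch: compute() returns the cached value for a key, invoking
# the callback and caching its result on a miss; once the cache holds
# "size" entries, the oldest entry is evicted first (lookup_value()
# below is a hypothetical loader).
#
#   my $cache = Bio::JBrowse::HashStore::FIFOCache->new( size => 2 );
#   my $v = $cache->compute( 'a', sub { lookup_value( $_[0] ) } );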


1;