Sophie

Sophie

distrib > Fedora > 13 > i386 > by-pkgid > 36617a4edd171540840dbabcaa70bb64 > files > 7

perl-Bio-ASN1-EntrezGene-1.091-9.fc13.noarch.rpm

#!/usr/bin/perl

# launch it like "perl parse_entrez_gene_example.pl Homo_sapiens" (Homo_sapiens can be downloaded
# and decompressed from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ASN/Mammalia/Homo_sapiens.gz)
# or use the included test file "perl parse_entrez_gene_example.pl ../t/input.asn"

################################################################################
# parse_entrez_gene_example
# Purpose: Demonstrates how to use Mingyi's Entrez Gene parser and retrieve
#          each data item from Entrez Gene.  This data extraction demo is
#           very important as I spent 3-4 times more time on this script
#           than on writing, debugging, profiling, optimizing my parser!
#           It's a tedious task and I hope this script helps you (I'm sure
#           it will, if you use my parser).
# NOTE!!! This example script shows where each data item from Entrez Gene
#         is in the data structure, but it does not store much data at all!
#         Therefore you will find little data in the dumpValue($gene) gene call
#         That's because data storage is project specific.  Please store
#         the extracted data items in your script according to your own plan.
#
#         Although the author tries to show how to get all data out of Entrez Gene,
#         there is no guarantee that all data from all versions of Entrez Gene will
#         be extracted using this script.
# Copyright: (c) 2005, Mingyi Liu, GPC Biotech, Altana Research Institute.
# License: this code is licensed under Perl itself or GPL.
# Citation: Liu, M and Grigoriev, A (2005) "Fast Parsers for Entrez Gene"
#           Bioinformatics. In press
################################################################################

use strict;
use Dumpvalue;
use Benchmark;
use Bio::ASN1::EntrezGene;

my $parser = Bio::ASN1::EntrezGene->new('file' => $ARGV[0]);
my $i = 0;
while(my $result = $parser->next_seq)
{
  unless(defined $result) # this never happens, but doesn't hurt
  {
    print STDERR "bad text for round #".++$i."!\n";
    next;
  }
  $result = $result->[0] if(ref($result) eq 'ARRAY'); # this should always be true
  Dumpvalue->new->dumpValue($result); # $result contains all Entrez Gene data
  my $gene = makegene($result);
  Dumpvalue->new->dumpValue($gene); # although data are extracted, very few are stored (and thus dumped) after I edited out project-specific stuff, user should decide about the storage themselves
  last;
}

#################################################################################
# NOTE!!! this is just example that shows where each data item from Entrez Gene
#         is in the data structure, therefore I did not store all data items! But
#         for your own script, you'd probably want to store them.
# sorry I don't have time to add more comments! I hope they're easy to understand.
sub makegene
{
  my $seq = shift;
#   Dumpvalue->new->dumpValue($seq);exit;
  my $geneid = safeval($seq, '{track-info}->[0]->{geneid}');
  die "no geneid found!\n" unless $geneid; # this never happens, but doesn't hurt
  my (%protaccs, %protgis);
  my $llgene = {};

  ###################################################################################
  # it is difficult to process Entrez Gene in event-triggered functions, so
  # we'll just process items one by one to pick & choose the ones we want
  safeassign($llgene, 'description', $seq, '{gene}->[0]->{desc}');
  safeassign($llgene, 'type', $seq, '{type}');
  safeassign($llgene, 'symbol', $seq, '{gene}->[0]->{locus}'); # may be overwritten
  map { push(@{$llgene->{genenames}}, $_) } @{$seq->{gene}->[0]->{syn}} if(safeval($seq, '{gene}->[0]->{syn}'));
  $llgene->{summary} = $seq->{summary} if($seq->{summary});
  $llgene->{chromosome} = $seq->{source}->[0]->{subtype}->[0]->{name} if(safeval($seq, '{source}->[0]->{subtype}->[0]->{subtype}') eq 'chromosome');
  safeassign($llgene, 'chrmap', $seq, '{gene}->[0]->{maploc}');
  addxrefs($llgene, 'HGNC', $1) if(safeval($seq, '{gene}->[0]->{locus-tag}') =~ /HGNC:(\S+)/i);

  ##########################################
  # HomoloGene
  if($seq->{homology})
  {
    # Entrez Gene documentation seems to have errors on this one (no label, text, anchor,
    # otherwise we could provide some description)
    # NOTE!!! species could be multiple species separated by ','
    my $id = safeval($seq, '{homology}->[0]->{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
    my $species = safeval($seq, '{homology}->[0]->{heading}');
  }

  ##########################################
  # OK, let's process the comments
  my $allseqs = {}; # refseq seqs
  foreach my $comment (@{$seq->{comments}})
  {
    # pubmed ids
    addxrefs($llgene, 'PUBMED', $comment->{refs}->[0]->{pmid}) if(ref($comment->{refs}) eq 'ARRAY');
    my $status = $comment->{label} if($comment->{heading} eq 'RefSeq Status');
    #####################
    # STS stuff
    if($comment->{heading} =~ /Markers.*STS\)$/)
    {
      foreach my $c (@{$comment->{comment}})
      {
        addxref($llgene, 'UNISTS', id => safeval($c, '{source}->[0]->{anchor}'),
                        acc => safeval($c, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}'));
      }
    }
    ##############################################################
    # refseq stuff, DNA=>protein, domain info, we store it first
    # and assemble info for future processing
    # of variants, transcripts and proteins, (dealing with these objects
    # is by far the most time-consuming task in extracting Entrez Gene data)
    if($comment->{heading} =~ /\(RefSeq\)$/ && $comment->{products})
    {
      foreach my $product (@{$comment->{products}})
      {
        my ($acc, $gi, $trans, %ids);
        # we are probably too careful below since refseq should have accession AND gi
        $acc = $product->{accession};
        $gi = safeval($product, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
        if($acc)
        {
          $ids{acc} = $acc;
          $trans->{acc} = $ids{acc};
          $trans->{gi} = $gi if $gi;
          $allseqs->{$acc} = $trans;
          $trans->{type} = $product->{heading} if($product->{type} ne 'mRNA');
        }
        $ids{gi} = $gi if($gi);
        if($product->{comment})
        {
          foreach my $c (@{$product->{comment}})
          {
            # check assembly info
            if($c->{heading} eq 'Source Sequence')
            {
              $trans->{assembly} = safeval($c, '{source}->[0]->{anchor}');
            }
            # check variant info
            if($c->{heading} eq 'Transcriptional Variant')
            {
              $trans->{variantcomment} = safeval($c, '{comment}->[0]->{text}');
            }
          }
        }
        #############################
        # now deal with protein
        if(ref($product->{products}) eq 'ARRAY') # protein
        {
          my ($prot);
          for(my $j = 0; $j < @{$product->{products}}; $j++)
          {
            my $p = $product->{products}->[$j];
            $acc = $p->{accession};
            $gi = safeval($p, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
            if($gi)
            {
              $ids{'protgi' . ($j+1)} = $gi;
              $protgis{$gi} = 'protgi' . ($j+1);
            }
            if($acc)
            {
              $ids{'protacc' . ($j+1)} = $acc;
              $protgis{$acc} = 'protacc' . ($j+1);
              $prot->{acc} = $acc;
              $prot->{gi} = $gi if $gi;
              $trans->{protein}->{$acc} = $prot;
              $prot->{type} = $product->{heading} if($p->{type} ne 'peptide');
            }
            safeassign($prot, 'name', $p, '{source}->[0]->{post-text}');
            #############################
            # check domain info
            if($p->{comment})
            {
              foreach my $c1 (@{$p->{comment}})
              {
                if($c1->{heading} =~ /Domains$/)
                {
                  my @domains;
                  foreach my $c2 (@{$c1->{comment}})
                  {
                    my $dom;
                    my $tmp = safeval($c2, '{comment}->[0]->{text}');
                    $dom->{score} = $tmp =~ /Blast Score: ([0-9.-]+)/;
                    ($dom->{start}, $dom->{end}) = $tmp =~ /Location: (\d+) - (\d+)/;
                    $dom->{desc} = safeval($c2, '{source}->[0]->{anchor}');
                    $dom->{xref} = { db  => safeval($c2, '{source}->[0]->{src}->[0]->{db}'),
                                     ids => [{type=>'id',id=>safeval($c2, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}')}]
                                   };
                    push(@domains, $dom);
                  }
                  $prot->{dom} = \@domains;
                }
                elsif($c1->{heading} =~ /\(CCDS\)/) # CCDS database xref
                {
                  $prot->{ccds} = safeval($c1, '{source}->[0]->{src}->[0]->{tag}->[0]->{str}');
                }
              }
            }
            ################
            # end domain
          }
        }
        ######################
        # end protein
        addxref($llgene, 'REFSEQ', %ids) if (keys %ids > 0);
      }
    }
    #########################################################
    # related seqeunces, goes into xref only
    if($comment->{heading} eq 'Related Sequences' && $comment->{products})
    {
      foreach my $product (@{$comment->{products}})
      {
        my ($acc, $gi, %ids);
        $acc = $product->{accession};
        $gi = safeval($product, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
        $ids{gi} = $gi if $gi;
        $ids{acc} = $acc if $acc;
        if(ref($product->{products}) eq 'ARRAY')
        {
          # just in case two genbank protein's assigned to one mRNA
          for(my $j = 0; $j < @{$product->{products}}; $j++)
          {
            $acc = $product->{products}->[$j]->{accession};
            $gi = safeval($product->{products}->[$j], '{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
            $ids{'protgi' . ($j+1)} = $gi if $gi;
            $ids{'protacc' . ($j+1)} = $acc if $acc;
          }
        }
        addxref($llgene, 'GENBANK', %ids) if (keys %ids > 0);
      }
    }
    ######################
    # various dblinks
    if($comment->{heading} eq 'Additional Links' && $comment->{comment})
    {
      foreach my $c (@{$comment->{comment}})
      {
        my $id = safeval($c, '{source}->[0]->{src}->[0]->{tag}->[0]->{str}');
        if($id) # add xref to llgene
        {
          my $db = uc(safeval($c, '{source}->[0]->{src}->[0]->{db}'));
          next if $db =~ /\s+/ || $db =~ /HomoloGene/i; # homologene id here is not real id, and some of dblinks' DB name are not real DBs, we temporarily use space to identify those
          my @ids = trim(split /,/, $id); # MGC sometimes has multiple ids concatenated by ','
          map { s/^$db://i; addxref($llgene, $db, 'id' => $_) } @ids; # $db: truncation useful for GDB ids
        }
        my $url = safeval($c, '{source}->[0]->{url}');
        if($url)
        {
          $url =~ s/\s//g; # some urls have linebreaks in them
          my $desc = safeval($c, '{source}->[0]->{anchor}');
        }
      }
    }

    ######################
    # Pathways
    if($comment->{heading} eq 'Pathways' && $comment->{comment})
    {
      foreach my $c (@{$comment->{comment}})
      {
        my $id = safeval($c, '{source}->[0]->{src}->[0]->{tag}->[0]->{str}');
        my $url = safeval($c, '{source}->[0]->{url}');
        if($url)
        {
          $url =~ s/\s//g; # some urls have linebreaks in them
          my $desc = ($c->{text})? $c->{text} : 'KEGG Pathway';
        }
        if($id) # add xref to llgene
        {
          my @ids = trim(split /,/, $id); # MGC sometimes has multiple ids concatenated by ','
          map { addxref($llgene, 'KEGG', 'id' => $_) } @ids; # $db: truncation useful for GDB ids
        }
      }
    }
    ######################
    # generif
    if($comment->{type} eq 'generif')
    {
      my @cs = ($comment->{heading} && ref($comment->{comment}) eq 'ARRAY')? @{$comment->{comment}} : $comment;
      foreach my $c (@cs)
      {
        my $generif = $c->{text};
        my $pmid => safeval($c, "{refs}->[0]->{pmid}");
        if($comment->{heading})
        {
          my $type = $comment->{heading};
          my ($db, $id) = makexref($c);
          my $anchor = safeval($c, '{source}->[0]->{anchor}');
          foreach my $c1 (@{$c->{comment}})
          {
            my ($db, $id) = makexref($c1);
            my ($label, $acc) = ((($c1->{label})? "$c1->{label}:" : ''), $c1->{accession});
          }
        }
        addxref($llgene, 'PUBMED', 'id' => safeval($c, "{refs}->[0]->{pmid}"));
      }
    }

    ######################
    # phenotype
    if($comment->{heading} eq 'Phenotypes')
    {
      my $detail = safeval($comment, '{comment}->[0]->{text}');
      if(safeval($comment, '{comment}->[0]->{source}'))
      {
        my $db = safeval($comment, '{comment}->[0]->{source}->[0]->{src}->[0]->{db}');
        my $id = safeval($comment, '{comment}->[0]->{source}->[0]->{src}->[0]->{tag}->[0]->{id}');
      }
    }

    ######################
    # relationships
    if($comment->{heading} eq 'Relationships')
    {
      foreach my $c (@{$comment->{comment}})
      {
        my $type = ($c->{text} =~ /related (.*)/)? $1 : $c->{text};
        my $anchor = safeval($c, '{source}->[0]->{anchor}');
        my ($db, $id) = makexref($c);
      }
    }

    ######################
    # tRNA
    if($comment->{heading} eq 'tRNA-ext')
    {
      my $trnatext = $comment->{text};
    }

    #########################################################################
    # ECNUM (documentation for Entrez Gene says it's here, so I put this in,
    # but it is actually in locus (see further below)
    if($comment->{type} =~ /property/i && $comment->{label} eq 'EC')
    {
      addxref($llgene, 'EC', 'id' => $comment->{text});
    }
  }

  my %map = ('FUNCTION'  => 'molecular function',
             'PROCESS'   => 'biological process',
             'COMPONENT' => 'cellular component');
  ##################################
  # OK, let's process the GO info
  if($seq->{properties})
  {
    foreach my $p (@{$seq->{properties}})
    {
      if($p->{heading} eq 'GeneOntology')
      {
        foreach my $c (@{$p->{comment}})
        {
          foreach my $c1 (@{$c->{comment}})
          {
            my ($db, $id) = (safeval($c1, '{source}->[0]->{src}->[0]->{db}'),
                             safeval($c1, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}'));
            addxref($llgene, $db, id => $id);
            my $category = $map{uc($c->{label})};
            my $content = safeval($c1, '{source}->[0]->{anchor}');
          }
        }
      }
      elsif($p->{label} eq 'Nomenclature')
      {
        foreach my $p1 (@{$p->{properties}})
        {
          if($p1->{label} eq 'Official Symbol')
          {
            my $hugosymbol = $p1->{text};
          }
          elsif($p1->{label} eq 'Official Full Name')
          {
            my $hugoname = $p1->{text};
          }
        }
      }
    }
  }
  ##################################
  # protein aliases
  if($seq->{prot})
  {
    foreach my $p (@{$seq->{prot}})
    {
      map { my $protalias = $_ } @{$p->{name}} if(ref($p->{name}) eq 'ARRAY');
    }
  }
  #####################################################################
  # now locus, again assemble info into $allseqs for future processing
  # of variants, transcripts and proteins, (dealing with these objects
  # is by far the most time-consuming task in extracting Entrez Gene data)
  if($seq->{locus})
  {
    # judgement call:
    # we should take NC_ whenever possible and disregard NT_
    # but in absence of NC_, we use NT_ to figure out exons
    foreach my $l (@{$seq->{locus}})
    {
      if($l->{products})
      {
        foreach my $p (@{$l->{products}})
        {
          ########################################
          # let's first get the accession numbers
          my %ids;
          $ids{acc} = $p->{accession};
          my $gi = safeval($p, '{seqs}->[0]->{whole}->[0]->{gi}');
          $ids{gi} = $gi if $gi;
          my $t = $allseqs->{$p->{accession}};
          $allseqs->{$p->{accession}}->{acc} = $ids{acc} unless $allseqs->{$p->{accession}}->{acc}; # sometimes NCBI forgot to put refseq IDs into comments about Refseq sequences, e.g. Gene 616.
          $allseqs->{$p->{accession}}->{gi} = $ids{gi} unless $allseqs->{$p->{accession}}->{gi}; # sometimes NCBI forgot to put refseq IDs into comments about Refseq sequences, e.g. Gene 616.
          if($p->{products})
          {
            for(my $j = 0; $j < @{$p->{products}}; $j++)
            {
              my $p1 = $p->{products}->[$j];
              $gi = safeval($p1, '{seqs}->[0]->{whole}->[0]->{gi}');
              my ($gino, $accno) = ((($protgis{$gi})? $protgis{$gi} : 'protgi' . ($j+1)),
                                   (($protaccs{$p1->{accession}})? $protaccs{$p1->{accession}} : 'protacc' . ($j+1)));
              if($gi)
              {
                $ids{$gino} = $gi;
                $protgis{$gi} = $gino;
              }
              $ids{$accno} = $p1->{accession};
              $protaccs{$p1->{accession}} = $accno;
              $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{acc} = $ids{$accno} unless $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{acc}; # sometimes NCBI forgot to put refseq IDs into comments about Refseq sequences, e.g. Gene 616.
              $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{gi} = $ids{$gino} unless $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{gi}; # sometimes NCBI forgot to put refseq IDs into comments about Refseq sequences, e.g. Gene 616.
              if($p1->{comment} && safeval($p1, '{comment}->[0]->{type}') eq 'property' &&
                 safeval($p1, '{comment}->[0]->{label}') eq 'EC')
              {
                my $ec = safeval($p1, '{comment}->[0]->{text}');
                # change dealing with EC number to add to xref
                addxref($llgene, 'EC', 'id' => $ec);
              }
            }
          }
          addxref($llgene, 'REFSEQ', %ids); # trans and prots xrefs

          #############################################################
          # now get exon coordinates - only do the work when necessary
          unless($t && $t->{genomic} && ($t->{genomic} =~ /^NC_/ || $l->{accession} =~ /^NT_/))
          {
            $allseqs->{$p->{accession}}->{genomic} = $l->{accession};
            $allseqs->{$p->{accession}}->{type} = $p->{type};
            my $tmp = safeval($p, '{genomic-coords}->[0]->{mix}->[0]->{int}') ||
                      safeval($p, '{genomic-coords}->[0]->{int}');
            $allseqs->{$p->{accession}}->{exons} = $tmp if($tmp);
            if($p->{products})
            {
              foreach my $p1 (@{$p->{products}})
              {
                #############################################################
                # now get protein location - only do the work when necessary
                $t = $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}};
                unless($t && $t->{from})
                {
                  my $tmp = safeval($p1, '{genomic-coords}->[0]->{packed-int}') ||
                            safeval($p1, '{genomic-coords}->[0]->{int}');
                  if($tmp)
                  {
                    $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{from} = $tmp->[0]->{from};
                    $allseqs->{$p->{accession}}->{protein}->{$p1->{accession}}->{to} = $tmp->[$#$tmp]->{to};
                  }
                }
              }
            }
          }
        }
        addxref($llgene, 'REFSEQ', 'acc' => $l->{accession},
                   'gi' => safeval($l, '{seqs}->[0]->{int}->[0]->{id}->[0]->{gi}')); # trans and prots xrefs
      }
    }
  }
  ##########################################################################
  # now that we got all info for transcripts and proteins
  # we should assemble info into variants, transcripts and proteins,
  # (dealing with these objects is by far the most time-consuming task
  # in extracting Entrez Gene data)
  # note again I edited out project-specific stuff, so the only purpose is
  # to show you where data items are, not to store everything
  my (@variants, @trans, $genestart, $geneend, $genomeacc);
  foreach my $dnaacc (keys %$allseqs)
  {
    my ($variant, $trans, $xref);
    my $t = $allseqs->{$dnaacc};
    if($t->{variantcomment})
    {
      $variant->{comment} = $t->{variantcomment};
    }
    ############################################
    # work on transcript
    $trans->{assembly} = $t->{assembly} if $t->{assembly};
    $trans->{type} = $t->{type} if $t->{type};
    $trans->{comment} = "Data from $t->{genomic}" if($t->{genomic});
    $genomeacc = $t->{genomic} unless $genomeacc;
    my $dealwithtransxref if($t->{acc} || $t->{gi}); # db should be refseq
    # now exon
    if($t->{exons})
    {
      if($t->{genomic} eq $genomeacc) # only process when the trans is on same contig as first one
      {
        $genestart = $t->{exons}->[0]->{from} if(!$genestart || $t->{exons}->[0]->{from} < $genestart);
        $geneend = $t->{exons}->[$#{$t->{exons}}]->{to} if($t->{exons}->[$#{$t->{exons}}]->{to} > $geneend);
      }
      foreach my $exon (@{$t->{exons}})
      {
        $trans->{strand} = $exon->{strand} unless $trans->{strand};
        my $start = $exon->{from}; # follow Entrez Gene style, start is always smaller than end
        my $end = $exon->{to};
        my $coordSys = "$t->{genomic}";
      }
      # finally protein
      if($t->{protein})
      {
        foreach my $pacc (keys %{$t->{protein}})
        {
          my $p = $t->{protein}->{$pacc};
          my $deal_with_prot_xref if($p->{acc}); # db should be refseq
          my $add_ccds_xref if($p->{ccds}); # db should be CCDS
          my $protname = $p->{name} if($p->{name});
          if($p->{from} || $p->{to}) # sometimes Entrez Gene forgets to annotate CDS start/end like for gene 574, NP_001178
          {
            my $protcoordSys = "$t->{genomic}";
            my $protstart = $p->{from} if $p->{from};
            my $protend = $p->{to} if $p->{to};
          }
          # domains
          if($p->{dom})
          {
            foreach my $dom (@{$p->{dom}})
            {
              my $desc = $dom->{desc};
              my $score = $dom->{score} if(defined $dom->{score});
              my $loc = { start => $dom->{start}, end=> $dom->{end}, coordSys => "$p->{acc}" };
              my $xref = $dom->{xref};
            }
          }
        }
      }
    }
    # put transcript into variant if annotated, otherwise into transcript
    if($variant)
    {
      # user can decide how to store
    }
    else
    {
      # user can decide how to store
    }
  }
  if($genestart && $geneend)
  {
    # user can decide how to store
  }

  # tracking info about how this gene has changed
  if(safeval($seq, '{track-info}->[0]->{current-id}'))
  {
    my (@ids, $newegid, $newllid);
    foreach my $id (@{$seq->{'track-info'}->[0]->{'current-id'}})
    {
      my $tmpid = safeval($id, '{tag}->[0]->{id}');
      push(@ids, "$id->{db}:$tmpid");
      $newegid = $tmpid if($id->{db} =~ /^GeneID$/i);
      $newllid = $tmpid if($id->{db} =~ /^LocusID$/i);
    }
    my $comment = "Gene moved: current IDs are: " . join(' ; ', @ids);
  }

  return $llgene;
}

# safely assign a value to $data->{$key} ($data must be hash)
sub safeassign
{
  my ($data, $key, $ds, $str) = @_;
  my $tmp = safeval($ds, $str);
  $data->{$key} = $tmp if $tmp;
  return (defined $tmp)? 1 : 0;
}

# safely extracts a value, another choice is to simply use
# eval in-line, if it fails, it fails.  Probably faster, but can't
# give feedback in-line (always has to add a couple lines dealing with
# $@ for error reporting), might still be worth it though because 
# of the speed.  User can make his/her own choice here.
sub safeval
{
  my ($ds, $str) = @_; # data structure and string (we need $ds passed in because we use strict)
  my @items = split('->', $str);
  foreach (@items)
  {
    my $tmp;
    if(($tmp) = /\[(\d+)\]/)
    {
      return undef unless(ref($ds) eq 'ARRAY' && @$ds > $tmp);
      $ds = $ds->[$tmp];
    }
    elsif(($tmp) = /^{(.*?)}$/)
    {
      return undef unless(ref($ds) eq 'HASH' && $ds->{$tmp}); # this is not ideal (since one might want to return '' instead of undef when this hash value is defined as ''), but correct for our situations
      $ds = $ds->{$tmp};
    }
    else
    {
      die "wrong syntax for string:$str\n";
    }
  }
  return $ds;
}

sub addxrefs
{
  # left for user implementation since it's project-specific
}

sub addxref
{
  # left for user implementation since it's project-specific
}

# used for making xrefs listed under comments
sub makexref
{
  my $c = shift;
  my $db = safeval($c, '{source}->[0]->{src}->[0]->{db}');
  my $id = safeval($c, '{source}->[0]->{src}->[0]->{tag}->[0]->{id}') ||
           safeval($c, '{source}->[0]->{src}->[0]->{tag}->[0]->{str}');
  # change the following as suited for your project
  $id =~ $1 if($id =~ /^"(.*)"$/);
  if($db =~ /GeneID/)
  {
    $db = 'ENTREZGENE';
  }
  elsif($db =~ /Nucleotide/)
  {
    $db = 'GENBANK';
  }
  return ($db, $id);
}

sub trim
{
  my @data = @_;
  map { s/^\s+//; s/\s+$//; } (@data);
  return wantarray ? @data : $data[0];
}