Tripal v1.0 (6.x-1.0)
gff_loader.inc File Reference

Go to the source code of this file.

Functions

 tripal_feature_gff3_load_form ()
 tripal_feature_gff3_load_form_validate ($form, &$form_state)
 tripal_feature_gff3_load_form_submit ($form, &$form_state)
 tripal_feature_load_gff3 ($gff_file, $organism_id, $analysis_id, $add_only=0, $update=0, $refresh=0, $remove=0, $use_transaction=1, $target_organism_id=NULL, $target_type=NULL, $create_target=0, $start_line=1, $landmark_type= '', $alt_id_attr= '', $job=NULL)
 tripal_feature_load_gff3_derives_from ($feature, $subject, $organism)
 tripal_feature_load_gff3_parents ($feature, $cvterm, $parents, $organism_id, $fmin)
 tripal_feature_load_gff3_dbxref ($feature, $dbxrefs)
 tripal_feature_load_gff3_ontology ($feature, $dbxrefs)
 tripal_feature_load_gff3_alias ($feature, $aliases)
 tripal_feature_load_gff3_feature ($organism, $analysis_id, $cvterm, $uniquename, $name, $residues, $is_analysis= 'f', $is_obsolete= 'f', $add_only, $score)
 tripal_feature_load_gff3_featureloc ($feature, $organism, $landmark, $fmin, $fmax, $strand, $phase, $is_fmin_partial, $is_fmax_partial, $residue_info, $locgroup, $landmark_type_id= '', $landmark_organism_id= '', $create_landmark=0, $landmark_is_target=0)
 tripal_feature_load_gff3_property ($feature, $property, $value)
 tripal_feature_load_gff3_fasta ($fh, $interval, &$num_read, &$intv_read, &$line_num)
 tripal_feature_load_gff3_target ($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup)

Detailed Description

Definition in file gff_loader.inc.


Function Documentation

tripal_feature_load_gff3_fasta ($fh, $interval, &$num_read, &$intv_read, &$line_num)

Definition at line 1718 of file gff_loader.inc.

                                                                                             {
  print "Loading FASTA sequences\n";
  $residues = '';
  $sql = " 
    PREPARE sel_gfftemp_un (text) AS
    SELECT feature_id FROM tripal_gff_temp
    WHERE uniquename = $1
  ";
  $status = tripal_core_chado_prepare('sel_gfftemp_un', $sql, array('text'));
  if (!$status) {
   watchdog('T_gff3_loader', 'Cannot prepare statement \'sel_gfftemp_un\'.', 
     array(), WATCHDOG_ERROR);
   return '';  
  }
  $id = NULL;
  
  // iterate through the remaining lines of the file
  while ($line = fgets($fh)) {
    
    $line_num++;
    $size = drupal_strlen($line);   
    $num_read += $size;
    $intv_read += $size; 
    
    $line = trim($line);      
    
    // update the job status every 1% features
    if ($job and $intv_read >= $interval) {
      $intv_read = 0;
      $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
      print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
      tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
    }
    
    // if we encounter a definition line then get the name, uniquename,
    // accession and relationship subject from the definition line
    if (preg_match('/^>/', $line)) {   
      // if we are beginning a new sequence then save the last one we 
      // just finished.     
             
      if ($id) {
        $sql = "EXECUTE sel_gfftemp_un('%s')";
        $result = tripal_core_chado_execute_prepared('sel_gfftemp_un', $sql, array($id));
        if (!$result) {
          watchdog('T_gff3_loader', 'Cannot find feature to assign FASTA sequence: %uname', 
             array('%uname' => $id), WATCHDOG_WARNING); 
        }
        // if we have a feature then add the residues
        else {    
          $feature = db_fetch_object($result);    
          $values = array('residues' => $residues);
          $match = array('feature_id' => $feature->feature_id);
          $options = array('statement_name' => 'upd_feature_re');
          tripal_core_chado_update('feature', $match, $values, $options);
        }
      }
      // get the feature ID for this ID from the tripal_gff_temp table
      $id = preg_replace('/^>(.*)$/', '\1', $line);      
      $residues = '';
    }
    else {
      $residues .= trim($line);
    }
  } 
  // add in the last sequence
  $sql = "EXECUTE sel_gfftemp_un('%s')";
  $result = tripal_core_chado_execute_prepared('sel_gfftemp_un', $sql, array($id));
  if (!$result) {
    watchdog('T_gff3_loader', 'Cannot find feature to assign FASTA sequence: %uname', 
       array('%uname' => $id), WATCHDOG_WARNING); 
  }
  // if we have a feature then add the residues
  else {        
    $feature = db_fetch_object($result);    
    $values = array('residues' => $residues);
    $match = array('feature_id' => $feature->feature_id);
    $options = array('statement_name' => 'upd_feature_re');
    tripal_core_chado_update('feature', $match, $values, $options);
  } 
}
tripal_feature_load_gff3_target ($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup)

Definition at line 1802 of file gff_loader.inc.

                                                                                                                             {
  // format is: "target_id start end [strand]", where strand is optional and may be "+" or "-"
  $matched = preg_match('/^(.*?)\s+(\d+)\s+(\d+)(\s+[\+|\-])*$/', trim($tags['Target'][0]), $matches);
  
  // the organism and type of the target may also be specified as an attribute. If so, then get that
  // information
  $gff_target_organism = array_key_exists('target_organism', $tags) ? $tags['target_organism'][0] : '';
  $gff_target_type = array_key_exists('target_type', $tags) ? $tags['target_type'][0] : '';
  
  // if we have matches and the Target is in the correct format then load the alignment 
  if ($matched) {
    $target_feature = $matches[1]; 
    $start = $matches[2]; 
    $end = $matches[3]; 
    // if we have an optional strand, convert it to a numeric value. 
    if ($matches[4]) {
      if (preg_match('/^\+$/', trim($matches[4]))) {
        $target_strand = 1;
      }
      elseif (preg_match('/^\-$/', trim($matches[4]))) {
        $target_strand = -1;
      }
      else {
        $target_strand = 0;
      }
    }
    else {
       $target_strand = 0;
    }
    
    $target_fmin = $start - 1;
    $target_fmax = $end;
    if ($end < $start) {
      $target_fmin = $end - 1;
      $target_fmax = $start;
    }
    
    // default the target organism to be the value passed into the function, but if the GFF
    // file species the target organism then use that instead.
    $t_organism_id = $target_organism_id;
    if ($gff_target_organism) {
      // get the genus and species
      $success = preg_match('/^(.*?):(.*?)$/', $gff_target_organism, $matches);
      if ($success) {
        $values = array(
          'genus' => $matches[1],
          'species' => $matches[2],
        );
        $options = array('statement_name' => 'sel_organism_gesp');
        $torganism = tripal_core_chado_select('organism', array('organism_id'), $values, $options);
        if (count($torganism) == 1) {
          $t_organism_id = $torganism[0]->organism_id;
        }
        else {
          watchdog('T_gff3_loader', "Cannot find organism for target %target.", 
            array('%target' => $gff_target_organism), WATCHDOG_WARNING);
          $t_organism_id = '';                                   
        }
      }
      else {
        watchdog('T_gff3_loader', "The target_organism attribute is improperly formatted: %target. 
          It should be target_organism=genus:species.", 
          array('%target' => $gff_target_organism), WATCHDOG_WARNING);
        $t_organism_id = '';                
      }
    }  
  
    // default the target type to be the value passed into the function, but if the GFF file
    // species the target type then use that instead
    $t_type_id = '';
    if ($target_type) {
      $values = array(
        'name' => $target_type,
        'cv_id' => array(
           'name' => 'sequence',
        )
      );
      $options = array('statement_name' => 'sel_cvterm_nacv');
      $type = tripal_core_chado_select('cvterm', array('cvterm_id'), $values, $options);
      if (count($type) == 1) {
        $t_type_id = $type[0]->cvterm_id;
      }
      else {
        watchdog('T_gff3_loader', "The target type does not exist in the sequence ontology: %type. ", 
          array('%type' => $target_type), WATCHDOG_ERROR);
        exit;  
      }
    }
    if ($gff_target_type) {
      $values = array(
        'name' => $gff_target_type,
        'cv_id' => array(
           'name' => 'sequence',
        )
      );
      $options = array('statement_name' => 'sel_cvterm_nacv');
      $type = tripal_core_chado_select('cvterm', array('cvterm_id'), $values, $options);
      if (count($type) == 1) {
        $t_type_id = $type[0]->cvterm_id;
      }
      else {
        watchdog('T_gff3_loader', "The target_type attribute does not exist in the sequence ontology: %type. ", 
          array('%type' => $gff_target_type), WATCHDOG_WARNING);
        $t_type_id = '';
      }
    }                       
    
    // we want to add a featureloc record that uses the target feature as the srcfeature (landmark)
    // and the landmark as the feature.
    tripal_feature_load_gff3_featureloc($feature, $organism, $target_feature, $target_fmin, 
      $target_fmax, $target_strand, $phase, $attr_fmin_partial, $attr_fmax_partial, $attr_residue_info, 
      $attr_locgroup, $t_type_id, $t_organism_id, $create_target, TRUE); 
  }
  // the target attribute is not correctly formatted
  else {
    watchdog('T_gff3_loader', "Could not add 'Target' alignment as it is improperly formatted:  '%target'",
      array('%target' => $tags['Target'][0]), WATCHDOG_ERROR);            
  }
}
 All Classes Files Functions Variables