Tripal v1.0 (6.x-1.0)
gff_loader.inc
Go to the documentation of this file.
00001 <?php
/**
 * Form builder for the GFF3 file loader.
 *
 * Collects the GFF3 file path, the organism and analysis the features belong
 * to, an optional start line, landmark type and alternate ID attribute, the
 * import mode checkboxes, and the options describing alignment targets.
 *
 * The organism and analysis drop-downs are populated from the {organism} and
 * {analysis} tables through chado_query(); each list starts with an empty
 * entry.
 *
 * @return
 *   The Form API array describing the form.
 */
function tripal_feature_gff3_load_form() {

  $form['gff_file'] = array(
    '#type'          => 'textfield',
    '#title'         => t('GFF3 File'),
    '#description'   => t('Please enter the full system path for the GFF file, or a path within the Drupal
                           installation (e.g. /sites/default/files/xyz.gff).  The path must be accessible to the
                           server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // Build the organism options, keyed by organism_id, with a leading blank
  // entry. The same list is reused below for the target organism.
  $organisms = array('' => '');
  $org_rset = chado_query("SELECT * FROM {organism} ORDER BY genus, species");
  while ($organism = db_fetch_object($org_rset)) {
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }
  // Element types are Form API identifiers, not display text, so they must
  // not be passed through t(): a translation would change the element type.
  $form['organism_id'] = array(
    '#title'       => t('Organism'),
    '#type'        => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required'    => TRUE,
    '#options'     => $organisms,
  );

  // Build the analysis options, keyed by analysis_id, with a leading blank
  // entry.
  $analyses = array('' => '');
  $analysis_rset = chado_query("SELECT * FROM {analysis} ORDER BY name");
  while ($analysis = db_fetch_object($analysis_rset)) {
    $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  }
  $form['analysis_id'] = array(
    '#title'       => t('Analysis'),
    '#type'        => 'select',
    '#description' => t("Choose the analysis to which these features are associated. 
       Why specify an analysis for a data load?  All data comes
       from some place, even if downloaded from Genbank. By specifying
       analysis details for all data imports it allows an end user to reproduce the
       data set, but at least indicates the source of the data."),
    '#required'    => TRUE,
    '#options'     => $analyses,
  );

  $form['line_number'] = array(
    '#type'          => 'textfield',
    '#title'         => t('Start Line Number'),
    '#description'   => t('Enter the line number in the GFF file where you would like to begin processing.  The 
      first line is line number 1.  This option is useful for examining loading problems with large GFF files.'),
    '#size' => 10,
  );

  $form['landmark_type'] = array(
    '#title'       => t('Landmark Type'),
    '#type'        => 'textfield',
    '#description' => t("Optional. Use this field to specify a Sequence Ontology type
       for the landmark sequences in the GFF fie (e.g. 'chromosome'). If the GFF file 
       contains a '##sequence-region' line that describes the landmark sequences to 
       which all others are aligned and a type is provided here then the features
       will be created if they do not already exist.  If they do exist then this
       field is not used."),
  );

  $form['alt_id_attr'] = array(
    '#title'       => t('ID Attribute'),
    '#type'        => 'textfield',
    '#description' => t("Optional. Sometimes lines in the GFF file are missing the 
      required ID attribute that specifies the unique name of the feature.  If so, 
      you may specify an the name of an existing  attribute to use for the name."),
  );

  // NOTE(review): '#collapsed' is set without '#collapsible' on both
  // fieldsets; confirm whether they are meant to start collapsed.
  $form['import_options'] = array(
    '#type' => 'fieldset',
    '#title' => t('Import Options'),
    '#collapsed' => TRUE
  );

  $form['import_options']['use_transaction'] = array(
    '#type' => 'checkbox',
    '#title' => t('Use a transaction'),
    '#required' => FALSE,
    '#description' => t('Use a database transaction when loading the GFF file.  If an error occurs 
      the entire datset loaded prior to the failure will be rolled back and will not be available
      in the database.  If this option is unchecked and failure occurs all records up to the point
      of failure will be present in the database.'),
    '#default_value' => 1,
  );
  $form['import_options']['add_only'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import only new features'),
    '#required' => FALSE,
    '#description' => t('The job will skip features in the GFF file that already
                         exist in the database and import only new features.'),
  );
  // "Import all and update" is the import mode that is ticked by default.
  $form['import_options']['update'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import all and update'),
    '#required' => FALSE,
    '#default_value' => 1,
    '#description' => t('Existing features will be updated and new features will be added.  Attributes
                         for a feature that are not present in the GFF but which are present in the
                         database will not be altered.'),
  );
  $form['import_options']['refresh'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import all and replace'),
    '#required' => FALSE,
    '#description' => t('Existing features will be updated and feature properties not
                         present in the GFF file will be removed.'),
  );
  $form['import_options']['remove'] = array(
    '#type' => 'checkbox',
    '#title' => t('Delete features'),
    '#required' => FALSE,
    '#description' => t('Features present in the GFF file that exist in the database
                         will be removed rather than imported'),
  );

  $form['targets'] = array(
    '#type' => 'fieldset',
    '#title' => t('Targets'),
    '#collapsed' => TRUE
  );
  $form['targets']['adesc'] = array(
    '#type' => 'markup',
    '#value' => t("When alignments are represented in the GFF file (e.g. such as 
       alignments of cDNA sequences to a whole genome, or blast matches), they are
       represented using two feature types: 'match' (or cDNA_match, EST_match, etc.) 
       and 'match_part'.  These features may also have a 'Target' attribute to
       specify the sequence that is being aligned.  
       However, the organism to which the aligned sequence belongs may not be present in the
       GFF file.  Here you can specify the organism and feature type of the target sequences.
       The options here will apply to all targets unless the organism and type are explicity
       set in the GFF file using the 'target_organism' and 'target_type' attributes."),
  );
  $form['targets']['target_organism_id'] = array(
    '#title'       => t('Target Organism'),
    '#type'        => 'select',
    '#description' => t("Optional. Choose the organism to which target sequences belong. 
      Select this only if target sequences belong to a different organism than the 
      one specified above. And only choose an organism here if all of the target sequences 
      belong to the same species.  If the targets in the GFF file belong to multiple 
      different species then the organism must be specified using the 'target_organism=genus:species' 
      attribute in the GFF file."),
    '#options'     => $organisms,
  );
  $form['targets']['target_type'] = array(
    '#title'       => t('Target Type'),
    '#type'        => 'textfield',
    '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
       and an mRNA have the same name) then you must specify the type for all targets in the GFF file. If 
       the targets are of different types then the type must be specified using the 'target_type=type' attribute
       in the GFF file. This must be a valid Sequence Ontology (SO) term."),
  );
  $form['targets']['create_target'] = array(
    '#type' => 'checkbox',
    '#title' => t('Create Target'),
    '#required' => FALSE,
    '#description' => t("If the target feature cannot be found, create one using the organism and type specified above, or
       using the 'target_organism' and 'target_type' fields specified in the GFF file.  Values specified in the
       GFF file take precedence over those specified above."),
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import GFF3 file'),
    '#weight' => 10,
  );

  return $form;
}
00194 
/**
 * Validation handler for tripal_feature_gff3_load_form().
 *
 * Sets a form error when:
 * - the GFF file is not a regular file, either relative to the Drupal
 *   installation or at the path exactly as entered;
 * - more than one of the import option checkboxes (add_only, update,
 *   refresh, remove) is ticked;
 * - a start line number is supplied that is not a whole number of at
 *   least 1.
 *
 * @param $form
 *   The form array (not read here).
 * @param $form_state
 *   The form state; only $form_state['values'] is read.
 */
function tripal_feature_gff3_load_form_validate($form, &$form_state) {
  $values = $form_state['values'];

  $gff_file    = trim($values['gff_file']);
  $line_number = trim($values['line_number']);

  // Look for the file inside the Drupal installation first; if nothing
  // exists there, treat the value as a full system path.
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
  if (!file_exists($dfile)) {
    $dfile = $gff_file;
  }
  // is_file() rather than file_exists(): a whitespace-only entry trims to an
  // empty string, which resolves to the Drupal root directory -- it exists
  // but is not a file that can be loaded.
  if (!is_file($dfile)) {
    form_set_error('gff_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  }

  // The import modes are mutually exclusive: count how many are ticked.
  $num_selected = 0;
  foreach (array('add_only', 'update', 'refresh', 'remove') as $option) {
    if (!empty($values[$option])) {
      $num_selected++;
    }
  }
  if ($num_selected > 1) {
    form_set_error('add_only', t("Please select only one checkbox from the import options section"));
  }

  // The start line is optional, but when given it must consist of digits
  // only and be at least 1 (the first line of the file is line 1). This
  // rejects values such as "1.5", "1e3", "-2" and "0".
  if ($line_number !== '' and (!preg_match('/^[0-9]+$/D', $line_number) or (int) $line_number < 1)) {
    form_set_error('line_number', t("Please provide an integer line number greater than zero."));
  }
}
00242 
/**
 * Submit handler for tripal_feature_gff3_load_form().
 *
 * Registers a Tripal job, owned by the current user, that names
 * 'tripal_feature_load_gff3' as its callback and carries the submitted form
 * values as the job arguments.
 *
 * @param $form
 *   The form array (not read here).
 * @param $form_state
 *   The form state; only $form_state['values'] is read.
 *
 * @return
 *   Always an empty string.
 */
function tripal_feature_gff3_load_form_submit($form, &$form_state) {
  global $user;

  $submitted = $form_state['values'];
  $gff_file  = trim($submitted['gff_file']);

  // Job arguments, listed in the same order as the parameters of
  // tripal_feature_load_gff3(). Free-text fields are trimmed.
  $args = array(
    $gff_file,
    $submitted['organism_id'],
    $submitted['analysis_id'],
    $submitted['add_only'],
    $submitted['update'],
    $submitted['refresh'],
    $submitted['remove'],
    $submitted['use_transaction'],
    $submitted['target_organism_id'],
    trim($submitted['target_type']),
    $submitted['create_target'],
    trim($submitted['line_number']),
    trim($submitted['landmark_type']),
    trim($submitted['alt_id_attr']),
  );

  // Pick the label used in the job name. Should several options be set, the
  // one listed last here wins.
  $labels = array(
    'add_only' => 'import only new features',
    'update'   => 'import all and update',
    'refresh'  => 'import all and replace',
    'remove'   => 'delete features',
  );
  $type = '';
  foreach ($labels as $option => $label) {
    if ($submitted[$option]) {
      $type = $label;
    }
  }

  // Keep only the part of the path after the last slash for the job name.
  $fname = preg_replace("/.*\/(.*)/", "$1", $gff_file);

  tripal_add_job(
    "{$type} GFF3 file: {$fname}",
    'tripal_feature',
    'tripal_feature_load_gff3',
    $args,
    $user->uid
  );

  return '';
}
00288 
00294 function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id, 
00295   $add_only =0, $update = 0, $refresh = 0, $remove = 0, $use_transaction = 1, 
00296   $target_organism_id = NULL, $target_type = NULL,  $create_target = 0, 
00297   $start_line = 1, $landmark_type = '', $alt_id_attr = '', $job = NULL) {     
00298 
00299   // make sure our temporary table exists
00300   $ret = array(); 
00301   if (!db_table_exists('tripal_gff_temp')) { 
00302     $schema = tripal_feature_get_custom_tables('tripal_gff_temp');  
00303     $success = tripal_core_create_custom_table($ret, 'tripal_gff_temp', $schema['tripal_gff_temp']);
00304     if (!$success) {
00305       watchdog('T_gff3_loader', "Cannot create temporary loading table", array(), WATCHDOG_ERROR); 
00306       return;
00307     } 
00308   }
00309   // empty the temp table
00310   $sql = "DELETE FROM tripal_gff_temp";
00311   chado_query($sql);
00312 
00313   // get a persistent connection
00314   $connection = tripal_db_persistent_chado();
00315   if (!$connection) {
00316      print "A persistant connection was not obtained. Loading will be slow\n";
00317   }
00318   
00319   // begin the transaction
00320   if ($use_transaction) {
00321     tripal_db_start_transaction();
00322         
00323     // if we cannot get a connection then let the user know the loading will be slow
00324     if (!$connection) {
00325        print "A persistant connection was not obtained. Loading will be slow\n";
00326     }
00327     else {
00328        print "\nNOTE: Loading of this GFF file is performed using a database transaction. \n" .
00329              "If the load fails or is terminated prematurely then the entire set of \n" .
00330              "insertions/updates is rolled back and will not be found in the database\n\n";
00331     }
00332   }
00333 
00334   // check to see if the file is located local to Drupal
00335   $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
00336   if (!file_exists($dfile)) {
00337     // if not local to Drupal, the file must be someplace else, just use
00338     // the full path provided
00339     $dfile = $gff_file;
00340   }
00341   if (!file_exists($dfile)) {
00342     watchdog('T_gff3_loader', "Cannot find the file: %dfile", 
00343       array('%dfile' => $dfile), WATCHDOG_ERROR);
00344     return 0;
00345   }
00346 
00347   print "Opening $gff_file\n";
00348 
00349   //$lines = file($dfile,FILE_SKIP_EMPTY_LINES);
00350   $fh = fopen($dfile, 'r');
00351   if (!$fh) {
00352     watchdog('T_gff3_loader', "cannot open file: %dfile", 
00353       array('%dfile' => $dfile), WATCHDOG_ERROR);
00354     return 0;
00355   }
00356   $filesize = filesize($dfile);
00357 
00358   // get the controlled vocaubulary that we'll be using.  The
00359   // default is the 'sequence' ontology
00360   // @coder-ignore: non-drupal schema thus table prefixing does not apply
00361   $sql = "SELECT * FROM cv WHERE name = '%s'";
00362   $cv = db_fetch_object(chado_query($sql, 'sequence'));
00363   if (!$cv) {   
00364     watchdog('T_gff3_loader', "Cannot find the 'sequence' ontology", 
00365       array(), WATCHDOG_ERROR);
00366     return '';
00367   }
00368 
00369   // get the organism for which this GFF3 file belongs
00370   // @coder-ignore: non-drupal schema thus table prefixing does not apply
00371   $sql = "SELECT * FROM organism WHERE organism_id = %d";
00372   $organism = db_fetch_object(chado_query($sql, $organism_id));
00373 
00374   $interval = intval($filesize * 0.0001);
00375   if ($interval == 0) {
00376     $interval = 1;
00377   }
00378   $in_fasta = 0;
00379   $line_num = 0;
00380   $num_read = 0;
00381   $intv_read = 0;
00382   
00383   // prepare the statement used to get the cvterm for each feature.
00384   if (!tripal_core_is_sql_prepared('sel_cvterm_idnasy')) {
00385     $psql = "PREPARE sel_cvterm_idnasy (int, text, text) AS
00386              SELECT CVT.cvterm_id, CVT.cv_id, CVT.name, CVT.definition,
00387                 CVT.dbxref_id, CVT.is_obsolete, CVT.is_relationshiptype
00388              FROM {cvterm} CVT
00389                 INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
00390                 LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
00391              WHERE CV.cv_id = $1 and 
00392                (lower(CVT.name) = lower($2) or lower(CVTS.synonym) = lower($3))";
00393      $status = tripal_core_chado_prepare('sel_cvterm_idnasy', $psql, array('int','text','text'));
00394      if (!$status) {
00395        watchdog('T_gff3_loader', 'cannot prepare statement \'sel_cvterm_idnasy\'.', 
00396          array(), WATCHDOG_ERROR);
00397        return '';
00398        
00399      }  
00400   } 
00401 
00402   // iterate through each line of the GFF file
00403   print "Parsing Line $line_num (0.00%). Memory: " . number_format(memory_get_usage()) . " bytes\r";
00404   while ($line = fgets($fh)) {
00405     $line_num++;
00406     $size = drupal_strlen($line);
00407     $num_read += $size;
00408     $intv_read += $size; 
00409     
00410     if($line_num < $start_line) {
00411       continue;
00412     }    
00413     
00414     // update the job status every 1% features
00415     if ($job and $intv_read >= $interval) {
00416       $intv_read = 0;
00417       $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
00418       print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
00419       tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
00420     }
00421       
00422     // check to see if we have FASTA section, if so then set the variable
00423     // to start parsing
00424     if (preg_match('/^##FASTA/i', $line)) {
00425       if($remove) {
00426         // we're done because this is a delete operation so break out of the loop.
00427         break;         
00428       }
00429       tripal_feature_load_gff3_fasta($fh, $interval, $num_read, $intv_read, $line_num);
00430       continue;
00431     }
00432     // if the ##sequence-region line is present then we want to add a new feature
00433     if (preg_match('/^##sequence-region (.*?) (\d+) (\d+)$/i', $line, $region_matches)) {
00434       $rid = $region_matches[1];
00435       $rstart = $region_matches[2];
00436       $rend = $region_matches[3];
00437       if ($landmark_type) {
00438         $result = chado_query("EXECUTE sel_cvterm_idnasy (%d, '%s', '%s')", $cv->cv_id, $landmark_type, $landmark_type);
00439         $cvterm = db_fetch_object($result);
00440         if (!$cvterm) {
00441           watchdog('T_gff3_loader', 'cannot find feature term \'%landmark_type\' on line %line_num of the GFF file', 
00442             array('%landmark_type' => $landmark_type, '%line_num' => $line_num), WATCHDOG_ERROR);
00443           return '';
00444         }
00445         tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm, $rid,
00446           $rid, '', 'f', 'f', 1, 0);
00447       }
00448       continue;
00449     }
00450     
00451     // skip comments
00452     if (preg_match('/^#/', $line)) {
00453       continue;
00454     }
00455     
00456     // skip empty lines
00457     if (preg_match('/^\s*$/', $line)) {
00458       continue;
00459     }    
00460 
00461     // get the columns
00462     $cols = explode("\t", $line);
00463     if (sizeof($cols) != 9) {
00464       watchdog('T_gff3_loader', 'improper number of columns on line %line_num', 
00465         array('%line_num' => $line_num), WATCHDOG_ERROR);
00466       return '';
00467     }
00468     
00469     // get the column values
00470     $landmark = $cols[0];
00471     $source   = $cols[1];
00472     $type     = $cols[2];
00473     $start    = $cols[3];
00474     $end      = $cols[4];
00475     $score    = $cols[5];
00476     $strand   = $cols[6];
00477     $phase    = $cols[7];
00478     $attrs    = explode(";", $cols[8]);  // split by a semicolon 
00479     
00480     // ready the start and stop for chado.  Chado expects these positions
00481     // to be zero-based, so we substract 1 from the fmin
00482     $fmin = $start - 1;
00483     $fmax = $end;
00484     if ($end < $start) {
00485       $fmin = $end - 1;
00486       $fmax = $start;
00487     }
00488     
00489     // format the strand for chado
00490     if (strcmp($strand, '.') == 0) {
00491       $strand = 0;
00492     }
00493     elseif (strcmp($strand, '+') == 0) {
00494       $strand = 1;
00495     }
00496     elseif (strcmp($strand, '-') == 0) {
00497       $strand = -1;
00498     }
00499     if (strcmp($phase, '.') == 0) {
00500       $phase = '';
00501     }
00502   
00503     $result = chado_query("EXECUTE sel_cvterm_idnasy (%d, '%s', '%s')", $cv->cv_id, $type, $type);
00504 
00505     $cvterm = db_fetch_object($result);
00506     if (!$cvterm) {
00507       watchdog('T_gff3_loader', 'cannot find feature term \'%type\' on line %line_num of the GFF file', 
00508         array('%type' => $type, '%line_num' => $line_num), WATCHDOG_ERROR);
00509       return '';
00510     }
00511      
00512     // break apart each of the attributes
00513     $tags = array();
00514     $attr_name = '';
00515     $attr_uniquename = '';
00516     $attr_residue_info = '';
00517     $attr_locgroup = 0;
00518     $attr_fmin_partial = 'f';
00519     $attr_fmax_partial = 'f';
00520     $attr_is_obsolete = 'f';
00521     $attr_is_analysis = 'f';
00522     $attr_others = '';
00523     $residues = '';
00524 
00525     foreach ($attrs as $attr) {
00526       $attr = rtrim($attr);
00527       $attr = ltrim($attr);
00528       if (strcmp($attr, '')==0) {
00529         continue;
00530       }
00531       if (!preg_match('/^[^\=]+\=.+$/', $attr)) {
00532         watchdog('T_gff3_loader', 'Attribute is not correctly formatted on line %line_num: %attr', 
00533           array('%line_num' => $line_num, '%attr' => $attr), WATCHDOG_ERROR);
00534         return '';
00535       }
00536 
00537       // break apart each tag
00538       $tag = preg_split("/=/", $attr, 2);  // split by equals sign
00539       
00540       // multiple instances of an attribute are separated by commas
00541       $tag_name = $tag[0];
00542       if (!array_key_exists($tag_name, $tags)) {
00543         $tags[$tag_name] = array();
00544       }
00545       $tags[$tag_name] = array_merge($tags[$tag_name], explode(",", $tag[1]));  // split by comma
00546       
00547       
00548       // replace the URL escape codes for each tag
00549       for ($i = 0; $i < count($tags[$tag_name]); $i++) {
00550         $tags[$tag_name][$i] = urldecode($tags[$tag_name][$i]);                  
00551       }
00552       
00553       // get the name and ID tags
00554       if (strcmp($tag_name, 'ID') == 0) {
00555         $attr_uniquename =  urldecode($tag[1]);
00556       }
00557       elseif (strcmp($tag_name, 'Name') == 0) {
00558         $attr_name =  urldecode($tag[1]);
00559       }
00560       // get the list of non-reserved attributes
00561       elseif (strcmp($tag_name, 'Alias') !=0        and strcmp($tag_name, 'Parent') !=0 and
00562               strcmp($tag_name, 'Target') !=0       and strcmp($tag_name, 'Gap') !=0 and
00563               strcmp($tag_name, 'Derives_from') !=0 and strcmp($tag_name, 'Note') !=0 and
00564               strcmp($tag_name, 'Dbxref') !=0       and strcmp($tag_name, 'Ontology_term') !=0 and
00565               strcmp($tag_name, 'Is_circular') !=0  and strcmp($tag_name, 'target_organism') !=0 and
00566               strcmp($tag_name, 'target_type') != 0) {
00567         foreach ($tags[$tag_name] as $value) {
00568           $attr_others[$tag_name][] = $value;
00569         }
00570       }
00571     }
00572 
00573     // if neither name nor uniquename are provided then generate one
00574     if (!$attr_uniquename and !$attr_name) {
00575       // check if an alternate ID field is suggested, if so, then use
00576       // that for the name
00577       if (array_key_exists($alt_id_attr, $tags)) {
00578         $attr_uniquename = $tags[$alt_id_attr][0];
00579         $attr_name = $attr_uniquename;  
00580       }
00581       // if the row has a parent then generate a uniquename using the parent name
00582       elseif (array_key_exists('Parent', $tags)) {
00583         $attr_uniquename = $tags['Parent'][0] . "-$type-$landmark:$fmin..$fmax";
00584         $attr_name = $attr_uniquename;
00585       }
00586       // generate a unique name based on the date, type and location
00587       // and set the name to simply be the type
00588       else {
00589         $date = getdate();
00590         $attr_uniquename = $date[0] . "-$type-$landmark:$fmin..$fmax";
00591         $attr_name = $type;
00592       }      
00593     }
00594 
00595     // if a name is not specified then use the unique name
00596     if (strcmp($attr_name, '')==0) {
00597       $attr_name = $attr_uniquename;
00598     }
00599 
00600     // if an ID attribute is not specified then use the attribute name and
00601     // hope for the best
00602     if (!$attr_uniquename) {
00603       $attr_uniquename = $attr_name;
00604     }
00605 
00606     // make sure the landmark sequence exists in the database.  We don't
00607     // know the type of the landmark so we'll hope that it's unique across
00608     // all types. If not we'll error out.  This test is only necessary if
00609     // if the landmark and the uniquename are different.  If they are the same
00610     // then this is the information for the landmark
00611     if (!$remove and strcmp($landmark, $attr_uniquename) != 0 ) {
00612       $select = array(
00613          'organism_id' => $organism_id,
00614          'uniquename'  => $landmark,
00615       );
00616       $columns = array('count(*) as num_landmarks');
00617       $options = array('statement_name' => 'sel_feature_numland');      
00618       $count = tripal_core_chado_select('feature', $columns, $select, $options);   
00619       if (!$count or count($count) == 0 or $count[0]->num_landmarks == 0) {
00620         watchdog('T_gff3_loader', "The landmark '%landmark' cannot be found for this organism (" . $organism->genus . " " . $organism->species . ") " .
00621               "Please add the landmark and then retry the import of this GFF3 ".
00622               "file", array('%landmark' => $landmark), WATCHDOG_ERROR);
00623         return '';
00624 
00625       }
00626       if ($count[0]->num_landmarks > 1) {
00627         watchdog('T_gff3_loader', "The landmark '%landmark' is not unique for this organism. ".
00628               "The features cannot be associated", array('%landmark' => $landmark), WATCHDOG_ERROR);
00629         return '';
00630       }  
00631     }
00632     
00633     // if the option is to remove or refresh then we want to remove
00634     // the feature from the database.
00635     if ($remove or $refresh) {
00636       $sql = "DELETE FROM {feature}
00637               WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
00638       $match = array(
00639          'organism_id' => $organism->organism_id,
00640          'uniquename'  => $attr_uniquename,
00641          'type_id'     => $cvterm->cvterm_id
00642       );
00643       $result = tripal_core_chado_delete('feature', $match);
00644       if (!$result) {
00645         watchdog('T_gff3_loader', "cannot delete feature %attr_uniquename", 
00646           array('%attr_uniquename' => $attr_uniquename), WATCHDOG_ERROR);
00647       }
00648       $feature = 0;
00649       unset($result);
00650     }
00651 
00652     // add or update the feature and all properties
00653     if ($update or $refresh or $add_only) {
00654 
00655       // add/update the feature
00656       $feature = tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm,
00657         $attr_uniquename, $attr_name, $residues, $attr_is_analysis,
00658         $attr_is_obsolete, $add_only, $score);  
00659    
00660       if ($feature) {
00661         
00662         // add a record for this feature to the tripal_gff_temp table for
00663         // later lookup
00664         $values = array(
00665           'feature_id' => $feature->feature_id,
00666           'organism_id' => $feature->organism_id,
00667           'type_name' => $type,
00668           'uniquename' => $feature->uniquename
00669         );
00670         // make sure this record doesn't already exist in oru temp table
00671         $options = array('statement_name' => 'sel_tripalgfftemp_all');
00672         $results = tripal_core_chado_select('tripal_gff_temp', array('*'), $values, $options);
00673 
00674         if (count($results) == 0) {
00675           $options = array('statement_name' => 'ins_tripalgfftemp');
00676           $result = tripal_core_chado_insert('tripal_gff_temp', $values, $options);
00677           if (!$result) {
00678             watchdog('T_gff3_loader', "Cound not save record in temporary table, Cannot continue.", array(), WATCHDOG_ERROR);
00679             exit;
00680           }
00681         }
00682 
00683         // add/update the featureloc if the landmark and the ID are not the same
00684         // if they are the same then this entry in the GFF is probably a landmark identifier
00685         if (strcmp($landmark, $attr_uniquename) !=0 ) {
00686           tripal_feature_load_gff3_featureloc($feature, $organism,
00687             $landmark, $fmin, $fmax, $strand, $phase, $attr_fmin_partial,
00688             $attr_fmax_partial, $attr_residue_info, $attr_locgroup);
00689         }
00690         // add any aliases for this feature
00691         if (array_key_exists('Alias', $tags)) {
00692           tripal_feature_load_gff3_alias($feature, $tags['Alias']);
00693         }      
00694         // add any dbxrefs for this feature
00695         if (array_key_exists('Dbxref', $tags)) {
00696           tripal_feature_load_gff3_dbxref($feature, $tags['Dbxref']);
00697         }
00698         // add any ontology terms for this feature
00699         if (array_key_exists('Ontology_term', $tags)) {
00700           tripal_feature_load_gff3_ontology($feature, $tags['Ontology_term']);
00701         }       
00702         // add parent relationships
00703         if (array_key_exists('Parent', $tags)) {
00704           tripal_feature_load_gff3_parents($feature, $cvterm, $tags['Parent'], $organism_id, $fmin);
00705         }               
00706         // add target relationships
00707         if (array_key_exists('Target', $tags)) {
00708           tripal_feature_load_gff3_target($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup);          
00709         }
00710         // add gap information.  This goes in simply as a property
00711         if (array_key_exists('Gap', $tags)) {
00712           foreach ($tags['Gap'] as $value) {
00713             tripal_feature_load_gff3_property($feature, 'Gap', $value);
00714           }
00715         }
00716         // add notes. This goes in simply as a property
00717         if (array_key_exists('Note', $tags)) {
00718           foreach ($tags['Note'] as $value) {
00719               tripal_feature_load_gff3_property($feature, 'Note', $value);
00720           }
00721         }
00722         // add the Derives_from relationship (e.g. polycistronic genes).
00723         if (array_key_exists('Derives_from', $tags)) {
00724           tripal_feature_load_gff3_derives_from($feature, $tags['Derives_from'][0], $organism);
00725         }
00726         // add in the GFF3_source dbxref so that GBrowse can find the feature using the source column
00727         $source_ref = array('GFF_source:' . $source);
00728         tripal_feature_load_gff3_dbxref($feature, $source_ref);
00729         // add any additional attributes
00730         if ($attr_others) {
00731           foreach ($attr_others as $tag_name => $values) {
00732             foreach ($values as $value) {
00733               tripal_feature_load_gff3_property($feature, $tag_name, $value);
00734             }
00735           }
00736         }
00737         
00738       }
00739     }
00740   }
00741 
00742   if (!$remove) {
00743     print "\nSetting ranks of children...\n";
00744     
00745     // get features in a relationship that are also children of an alignment
00746     $sql = "SELECT DISTINCT F.feature_id, F.organism_id, F.type_id, 
00747               F.uniquename, FL.strand 
00748             FROM tripal_gff_temp TGT 
00749               INNER JOIN feature F                ON TGT.feature_id = F.feature_id
00750               INNER JOIN feature_relationship FR  ON FR.object_id = TGT.feature_id
00751               INNER JOIN cvterm CVT               ON CVT.cvterm_id = FR.type_id  
00752               INNER JOIN featureloc FL            ON FL.feature_id = F.feature_id    
00753             WHERE CVT.name = 'part_of'";
00754     $parents = chado_query($sql);
00755     
00756     // build and prepare the SQL for selecting the children relationship
00757     $sql = "SELECT DISTINCT FR.feature_relationship_id, FL.fmin, FR.rank
00758             FROM feature_relationship FR              
00759               INNER JOIN featureloc FL on FL.feature_id = FR.subject_id";
00760     if (!$connection) {
00761       $sql .= "WHERE FR.object_id = %d ".
00762               "ORDER BY FL.fmin ASC ";
00763     }
00764     else {
00765       $sql = "PREPARE sel_gffchildren (int) AS " . $sql . " WHERE FR.object_id = \$1 ORDER BY FL.fmin ASC";            
00766     }
00767     if (!tripal_core_is_sql_prepared('sel_gffchildren')) {
00768       $success = tripal_core_chado_prepare('sel_gffchildren', $sql, array('int'));
00769       if (!$success) {
00770         watchdog("T_gff3_loader", "Cannot prepare statement 'sel_gffchildren' and cannot set children ranks.", 
00771            array(), WATCHDOG_WARNING);
00772         return 0;  
00773       }
00774     }
00775     
00776     // now set the rank of any parent/child relationships.  The order is based
00777     // on the fmin.  The start rank is 1.  This allows features with other
00778     // relationships to be '0' (the default), and doesn't interfer with the
00779     // ordering defined here.        
00780     while ($parent = db_fetch_object($parents)) {
00781       
00782       // get the children
00783       if ($connection) {      
00784         $result = chado_query('EXECUTE sel_gffchildren (%d)', $parent->feature_id);
00785       }
00786       else {
00787         $result = chado_query($sql, $parent->feature_id);
00788       }
00789       
00790       // build an array of the children
00791       $children = array();
00792       while ($child = db_fetch_object($result)) {
00793          $children[] = $child;  
00794       }
00795       
00796       // the children list comes sorted in ascending fmin
00797       // but if the parent is on the reverse strand we need to 
00798       // reverse the order of the children.
00799       if ($parent->strand == -1) {
00800         arsort($children);
00801       }    
00802   
00803       // first set the ranks to a negative number so that we don't
00804       // get a duplicate error message when we try to change any of them    
00805       $rank = -1;
00806       foreach ($children as $child) {
00807         $match = array('feature_relationship_id' => $child->feature_relationship_id);
00808         $options = array('statement_name' => 'upd_featurerelationship_rank');      
00809         $values = array('rank' => $rank);      
00810         tripal_core_chado_update('feature_relationship', $match, $values, $options);
00811         $rank--;
00812       }
00813       // now set the rank correctly. The rank should start at 0.
00814       $rank = 0;
00815       foreach ($children as $child) {
00816         $match = array('feature_relationship_id' => $child->feature_relationship_id);
00817         $options = array('statement_name' => 'upd_featurerelationship_rank');      
00818         $values = array('rank' => $rank); 
00819         //print "Was: " . $child->rank . " now $rank ($parent->strand)\n"     ;
00820         tripal_core_chado_update('feature_relationship', $match, $values, $options);
00821         $rank++;
00822       }
00823     }
00824   }
00825   
00826   // commit the transaction
00827   if ($use_transaction) {
00828     tripal_db_commit_transaction();
00829   }
00830   print "Done\n";
00831   
00832   return 1;
00833 }
/**
 * Records a 'derives_from' relationship between a loaded feature and the
 * feature named in its GFF3 Derives_from attribute.
 *
 * The named feature becomes the object of the relationship and $feature
 * becomes the subject.  Nothing is inserted when an identical rank-0
 * relationship is already present.
 *
 * @param $feature
 *   Feature record object; its feature_id and uniquename are read.
 * @param $subject
 *   Unique name of the feature given in the Derives_from attribute.
 * @param $organism
 *   Organism record object; its organism_id is read.
 *
 * @return
 *   An empty string when the named feature has no entry in tripal_gff_temp;
 *   nothing in every other case.
 */
function tripal_feature_load_gff3_derives_from($feature, $subject, $organism) {

  // the temp table records the type under which the named feature was loaded
  $temp_rows = tripal_core_chado_select(
    'tripal_gff_temp',
    array('type_name'),
    array(
      'organism_id' => $organism->organism_id,
      'uniquename' => $subject,
    ),
    array('statement_name' => 'sel_tripalgfftemp_orun')
  );
  if (!count($temp_rows)) {
    watchdog("T_gff3_loader", "Cannot find subject type for feature in 'derives_from' relationship: %subject", array('%subject' => $subject), WATCHDOG_WARNING);
    return '';
  }
  $subject_type = $temp_rows[0]->type_name;

  // look up the named feature itself, using the type found above
  $source_features = tripal_core_chado_select(
    'feature',
    array('feature_id'),
    array(
      'organism_id' => $organism->organism_id,
      'uniquename' => $subject,
      'type_id' => array(
        'name' => $subject_type,
        'cv_id' => array(
          'name' => 'sequence'
        ),
      ),
    ),
    array('statement_name' => 'sel_feature_orunty')
  );
  if (!count($source_features)) {
    watchdog('T_gff3_loader', "Could not add 'Derives_from' relationship ".
      "for %uniquename and %subject.  Subject feature, '%subject', ".
      "cannot be found", array('%uniquename' => $feature->uniquename, '%subject' => $subject), WATCHDOG_ERROR);
    return;
  }

  // the same record serves as the existence test and as the insert values
  $relationship = array(
    'object_id' => $source_features[0]->feature_id,
    'subject_id' => $feature->feature_id,
    'type_id' => array(
       'cv_id' => array(
          'name' => 'relationship'
        ),
       'name' => 'derives_from',
    ),
    'rank' => 0
  );
  $existing = tripal_core_chado_select(
    'feature_relationship',
    array('*'),
    $relationship,
    array('statement_name' => 'sel_featurerelationship_objectid_subjectid_typeid_rank')
  );

  // only insert when no matching relationship was found
  if (count($existing) <= 0) {
    $inserted = tripal_core_chado_insert(
      'feature_relationship',
      $relationship,
      array('statement_name' => 'ins_featurerelationship_objectid_subjectid_typeid_rank')
    );
    if (!$inserted) {
      watchdog("T_gff3_loader", "Could not add 'Derives_from' relationship for $feature->uniquename and $subject", 
        array(), WATCHDOG_WARNING);
    }
  }
}
/**
 * Links a feature to each of its GFF3 Parent features with a 'part_of'
 * relationship.
 *
 * For every parent name the type is read from tripal_gff_temp, the parent
 * feature is looked up by organism, unique name and type, and a
 * feature_relationship record (subject = $feature, object = parent) is
 * inserted unless one already exists.
 *
 * @param $feature
 *   Feature record object; its feature_id and uniquename are read.
 * @param $cvterm
 *   Type term of $feature; only its name is read, for log messages.
 * @param $parents
 *   Array of parent unique names taken from the Parent attribute.
 * @param $organism_id
 *   Organism id used when looking up the parents.
 * @param $fmin
 *   Not used by this function.
 *
 * @return
 *   An empty string when the lookup statement cannot be prepared, when the
 *   'part_of' term cannot be found, or when a parent has no entry in
 *   tripal_gff_temp (processing of the remaining parents stops); nothing
 *   otherwise.
 */
function tripal_feature_load_gff3_parents($feature, $cvterm, $parents, $organism_id, $fmin) {

  $uname = $feature->uniquename;
  $type = $cvterm->name;
  $rel_type = 'part_of';

  // prepare these SQL statements that will be used repeatedly.
  if (!tripal_core_is_sql_prepared('sel_cvterm_cvname_cvtname_synonym')) {
    $psql = "PREPARE sel_cvterm_cvname_cvtname_synonym (text, text, text) AS
             SELECT CVT.cvterm_id
             FROM cvterm CVT
               INNER JOIN cv CV on CVT.cv_id = CV.cv_id
               LEFT JOIN cvtermsynonym CVTS on CVTS.cvterm_id = CVT.cvterm_id
             WHERE cv.name = $1 and (CVT.name = $2 or CVTS.synonym = $3)";
    $status = tripal_core_chado_prepare('sel_cvterm_cvname_cvtname_synonym', $psql, array('text', 'text' ,'text'));
    if (!$status) {
       watchdog("T_gff3_loader", "Cannot prepare statement 'sel_cvterm_cvname_cvtname_synonym' for ontology term", 
         array(), WATCHDOG_WARNING);
       return '';
    }
  }

  // the relationship type is the same for every parent, so look it up once
  // and stop early if it is missing rather than dereferencing a non-object
  $relcvterm = db_fetch_object(chado_query("EXECUTE sel_cvterm_cvname_cvtname_synonym ('%s', '%s', '%s')", 'relationship', $rel_type, $rel_type));
  if (!$relcvterm) {
    watchdog("T_gff3_loader", "Cannot find the relationship term '%rel_type'. Cannot add parent relationships for '%uname'",
      array('%rel_type' => $rel_type, '%uname' => $uname), WATCHDOG_WARNING);
    return '';
  }

  // iterate through the parents in the list
  foreach ($parents as $parent) {
    // get the parent type as recorded while the GFF file was being read
    $values = array(
      'organism_id' => $organism_id,
      'uniquename' => $parent,
    );
    $options = array('statement_name' => 'sel_tripalgfftemp_orun');
    $result = tripal_core_chado_select('tripal_gff_temp', array('type_name'), $values, $options);
    if (count($result) == 0) {
      watchdog("T_gff3_loader", "Cannot find parent: %parent", array('%parent' => $parent), WATCHDOG_WARNING);
      return '';
    }
    $parent_type = $result[0]->type_name;

    // resolve the parent type to a cvterm; without it the parent feature
    // cannot be identified, so log and move on to the next parent
    $parentcvterm = db_fetch_object(chado_query("EXECUTE sel_cvterm_cvname_cvtname_synonym ('%s', '%s', '%s')", 'sequence', $parent_type, $parent_type));
    if (!$parentcvterm) {
      watchdog("T_gff3_loader", "Cannot establish relationship '%uname' (%type) %rel_type '%parent' (%parent_type): Cannot find the term for the parent type",
        array('%uname' => $uname, '%type' => $type, '%rel_type' => $rel_type, '%parent' => $parent, '%parent_type' => $parent_type),
        WATCHDOG_WARNING);
      continue;
    }

    // try to find the parent
    $values = array(
        'organism_id' => $organism_id,
        'uniquename' => $parent,
        'type_id' => $parentcvterm->cvterm_id,
    );
    $options = array('statement_name' => 'sel_feature_orunty');
    $result = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
    // only index into the result when a row actually came back
    $parent_feature = (is_array($result) and count($result) > 0) ? $result[0] : NULL;

    // if the parent exists then add the relationship otherwise print error and skip
    if (!$parent_feature) {
      watchdog("T_gff3_loader", "Cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent", 
        array(), WATCHDOG_WARNING);
      continue;
    }

    // check to see if the relationship already exists
    $values = array(
      'object_id' => $parent_feature->feature_id,
      'subject_id' => $feature->feature_id,
      'type_id' => $relcvterm->cvterm_id,
    );
    $options = array('statement_name' => 'sel_featurerelationship_objectid_subjectid_typeid');
    $rel = tripal_core_chado_select('feature_relationship', array('*'), $values, $options);

    if (count($rel) == 0) {
      // the relationship doesn't already exist, so add it.
      $values = array(
         'subject_id' => $feature->feature_id,
         'object_id'  => $parent_feature->feature_id,
         'type_id' => $relcvterm->cvterm_id,
      );
      $options = array('statement_name' => 'ins_featurerelationship_subjectid_objectid_typeid');
      $result = tripal_core_chado_insert('feature_relationship', $values, $options);
      if (!$result) {
        watchdog("T_gff3_loader", "Failed to insert feature relationship '$uname' ($type) $rel_type '$parent' ($parent_type)", 
          array(), WATCHDOG_WARNING);
      }
    }
  }
}
00989 
/**
 * Associates a feature with a list of database cross references.
 *
 * Each reference has the form "dbname:accession".  The database is looked up
 * first as "DB:dbname" and then as "dbname", and is created when neither is
 * found.  The accession is created when missing, and the feature_dbxref link
 * is inserted unless it already exists.
 *
 * @param $feature
 *   Feature record object; its feature_id is read.
 * @param $dbxrefs
 *   Array of reference strings.
 *
 * @return
 *   1 when the end of the list is reached (references without a colon are
 *   skipped with a warning), 0 as soon as a database, accession or link
 *   cannot be found or inserted.
 */
function tripal_feature_load_gff3_dbxref($feature, $dbxrefs) {

  // iterate through each of the dbxrefs
  foreach ($dbxrefs as $dbxref) {

    // split on the first colon only so that an accession which itself
    // contains a colon is kept whole
    $ref = explode(':', $dbxref, 2);
    if (count($ref) < 2) {
      watchdog("T_gff3_loader", "Skipping malformed Dbxref '%dbxref': expected the form dbname:accession",
        array('%dbxref' => $dbxref), WATCHDOG_WARNING);
      continue;
    }
    $dbname = $ref[0];
    $accession = $ref[1];

    // first look for the database name if it doesn't exist then create one.
    // first check for the fully qualified URI (e.g. DB:<dbname>. If that
    // can't be found then look for the name as is.  If it still can't be found
    // the create the database
    $values = array('name' => "DB:$dbname");
    $options = array('statement_name' => 'sel_db_name');
    $db = tripal_core_chado_select('db', array('db_id'), $values, $options);
    if (count($db) == 0) {
      $values = array('name' => "$dbname");
      $db = tripal_core_chado_select('db', array('db_id'), $values, $options);
    }
    if (count($db) == 0) {
      $values = array(
        'name' => $dbname,
        'description' => 'Added automatically by the GFF loader'
      );
      $options = array('statement_name' => 'ins_db_name');
      $success = tripal_core_chado_insert('db', $values, $options);
      if ($success) {
        $values = array('name' => "$dbname");
        $options = array('statement_name' => 'sel_db_name');
        $db = tripal_core_chado_select('db', array('db_id'), $values, $options);
      }
      // the insert failed, or the new record could not be read back
      if (!$success or empty($db)) {
        watchdog("T_gff3_loader", "Cannot find or add the database $dbname", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
    $db = $db[0];

    // now check to see if the accession exists
    $values = array(
      'accession' => $accession, 
      'db_id' => $db->db_id
    );
    $options = array('statement_name' => 'sel_dbxref_accession_dbid');
    $dbxref_rows = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $options);

    // if the accession doesn't exist then we want to add it
    if (sizeof($dbxref_rows) == 0) {
      $values = array(
        'db_id' => $db->db_id,
        'accession' => $accession, 
        'version' => ''
      );
      $options = array('statement_name' => 'ins_dbxref_dbid_accession_version');
      $ret = tripal_core_chado_insert('dbxref', $values, $options);
      if ($ret) {
        $values = array(
          'accession' => $accession, 
          'db_id' => $db->db_id
        );
        $options = array('statement_name' => 'sel_dbxref_accession_dbid');
        $dbxref_rows = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $options);
      }
      // stop here instead of reading a missing record further down
      if (!$ret or empty($dbxref_rows)) {
        watchdog("T_gff3_loader", "Failed to insert Dbxref: $dbname:$accession", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
    $dbxref_rec = $dbxref_rows[0];

    // check to see if this feature dbxref already exists
    $values = array(
      'dbxref_id' => $dbxref_rec->dbxref_id, 
      'feature_id' => $feature->feature_id
    );
    $options = array('statement_name' => 'sel_featuredbxref_dbxrefid_featureid');
    $fdbx = tripal_core_chado_select('feature_dbxref', array('feature_dbxref_id'), $values, $options);

    // now associate this feature with the database reference if it doesn't
    // already exist
    if (sizeof($fdbx) == 0) {
      $values = array(
        'dbxref_id' => $dbxref_rec->dbxref_id,
        'feature_id' => $feature->feature_id
      );
      $options = array('statement_name' => 'ins_featuredbxref_dbxrefid_featureid');
      $success = tripal_core_chado_insert('feature_dbxref', $values, $options);
      if (!$success) {
        watchdog("T_gff3_loader", "Failed to insert Dbxref: $dbname:$accession", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
  }
  return 1;
}
/**
 * Associates a feature with a list of ontology terms.
 *
 * Each reference has the form "dbname:accession".  Unlike the Dbxref loader
 * nothing is created here: the database, the accession and the cvterm (or an
 * alternate id in cvterm_dbxref) must already exist.  The feature_cvterm link
 * is inserted, using the 'null' publication, unless it already exists.
 *
 * @param $feature
 *   Feature record object; its feature_id is read.
 * @param $dbxrefs
 *   Array of ontology term reference strings.
 *
 * @return
 *   1 when the end of the list is reached (references without a colon are
 *   skipped with a warning), 0 as soon as a database, accession or term is
 *   missing or the link cannot be inserted.
 */
function tripal_feature_load_gff3_ontology($feature, $dbxrefs) {

   // iterate through each of the dbxrefs
  foreach ($dbxrefs as $dbxref) {

    // split on the first colon only so that an accession which itself
    // contains a colon is kept whole
    $ref = explode(':', $dbxref, 2);
    if (count($ref) < 2) {
      watchdog("T_gff3_loader", "Skipping malformed ontology term '%term': expected the form dbname:accession",
        array('%term' => $dbxref), WATCHDOG_WARNING);
      continue;
    }
    $dbname = $ref[0];
    $accession = $ref[1];

    // first look for the database name
    $options = array('statement_name' => 'sel_db_name');
    $db = tripal_core_chado_select('db', array('db_id'), array('name' => "DB:$dbname"), $options);
    if (sizeof($db) == 0) {
      // now look for the name without the 'DB:' prefix.
      $db = tripal_core_chado_select('db', array('db_id'), array('name' => "$dbname"), $options);
      if (sizeof($db) == 0) {
        watchdog("T_gff3_loader", "Database, $dbname, is not present. Cannot associate term: $dbname:$accession", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
    $db = $db[0];

    // now check to see if the accession exists
    $options = array('statement_name' => 'sel_dbxref_accession_dbid');
    $dbxref_rows = tripal_core_chado_select('dbxref', array('dbxref_id'), 
      array('accession' => $accession, 'db_id' => $db->db_id), $options);
    if (sizeof($dbxref_rows) == 0) {
      watchdog("T_gff3_loader", "Accession, $accession is missing for reference: $dbname:$accession", array(), WATCHDOG_WARNING);
      return 0;
    }
    $dbxref_rec = $dbxref_rows[0];

    // now check to see if the cvterm exists
    $options = array('statement_name' => 'sel_cvterm_dbxrefid');
    $cvterm = tripal_core_chado_select('cvterm', array('cvterm_id'), array(
       'dbxref_id' => $dbxref_rec->dbxref_id), $options);
    // if it doesn't exist in the cvterm table, look for an alternate id
    if (sizeof($cvterm) == 0) {
      $options = array('statement_name' => 'sel_cvtermdbxref_dbxrefid');
      $cvterm = tripal_core_chado_select('cvterm_dbxref', array('cvterm_id'), array(
        'dbxref_id' => $dbxref_rec->dbxref_id), $options);
      if (sizeof($cvterm) == 0) {
        watchdog("T_gff3_loader", "CV Term is missing for reference: $dbname:$accession", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
    $cvterm = $cvterm[0];

    // check to see if this feature cvterm already exists
    $options = array('statement_name' => 'sel_featurecvterm_cvtermid_featureid');
    $fcvt = tripal_core_chado_select('feature_cvterm', array('feature_cvterm_id'),
      array('cvterm_id' => $cvterm->cvterm_id, 'feature_id' => $feature->feature_id),
      $options);

    // now associate this feature with the cvterm if it doesn't already exist
    if (sizeof($fcvt)==0) {
      $values = array(
        'cvterm_id' => $cvterm->cvterm_id,
        'feature_id' => $feature->feature_id,
        'pub_id' => array(
          'uniquename' => 'null',
        ),
      );
      $options = array('statement_name' => 'ins_featurecvterm_cvtermid_featureid_pubid');
      $success = tripal_core_chado_insert('feature_cvterm', $values, $options);

      if (!$success) {
        watchdog("T_gff3_loader", "Failed to insert ontology term: $dbname:$accession", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
  }
  return 1;
}
/**
 * Adds the aliases of a feature as 'exact' synonyms.
 *
 * The 'synonym_type' vocabulary, its 'exact' term and the 'null' publication
 * are created when they are missing.  Each alias is then added to the synonym
 * table if necessary and linked to the feature through feature_synonym.
 *
 * @param $feature
 *   Feature record object; its feature_id is read.
 * @param $aliases
 *   Array of alias strings taken from the Alias attribute.
 *
 * @return
 *   1 when every alias was processed, 0 as soon as a required record cannot
 *   be found or inserted.
 */
function tripal_feature_load_gff3_alias($feature, $aliases) {

  // make sure we have a 'synonym_type' vocabulary
  $cv_select = array('name' => 'synonym_type');
  $options = array('statement_name' => 'sel_cv_name');
  $results = tripal_core_chado_select('cv', array('*'), $cv_select, $options);

  if (count($results) == 0) {
    // insert the 'synonym_type' vocabulary
    $values = array(
      'name' => 'synonym_type',
      'definition' => 'vocabulary for synonym types',
    );
    $options = array('statement_name' => 'ins_cv_name_definition');
    $success = tripal_core_chado_insert('cv', $values, $options);
    if (!$success) {
      watchdog("T_gff3_loader", "Failed to add the synonyms type vocabulary", array(), WATCHDOG_WARNING);
      return 0;
    }
    // now that we've added the cv we need to get the record
    $options = array('statement_name' => 'sel_cv_name');
    $results = tripal_core_chado_select('cv', array('*'), $cv_select, $options);
    if (empty($results)) {
      // without the record the vocabulary name cannot be read below
      watchdog("T_gff3_loader", "Failed to add the synonyms type vocabulary", array(), WATCHDOG_WARNING);
      return 0;
    }
  }
  $syncv = $results[0];

  // get the 'exact' cvterm, which is the type of synonym we're adding
  $term_select = array(
     'name' => 'exact',
     'cv_id' => array(
        'name' => 'synonym_type'
     ),
  );
  $options = array('statement_name' => 'sel_cvterm_name_cvid');
  $result = tripal_core_chado_select('cvterm', array('*'), $term_select, $options);
  if (count($result) == 0) {
    $term = array(
      'name' => 'exact',
      'id' => "internal:exact",
      'definition' => '',
      'is_obsolete' => 0,
    );
    // TODO: fix the function so it uses prepared statements
    $syntype = tripal_cv_add_cvterm($term, $syncv->name, 0, 1);
    if (!$syntype) {
      watchdog("T_gff3_loader", "Cannot add synonym type: internal:exact", array(), WATCHDOG_WARNING);
      return 0;
    }
  }
  else {
    $syntype = $result[0];
  }

  // the null publication is the same for every alias, so it is looked up
  // (and created if needed) only for the first alias and then reused
  $pub = NULL;

  // iterate through all of the aliases and add each one
  foreach ($aliases as $alias) {

    // check to see if the alias already exists in the synonym table
    // if not, then add it
    $syn_select = array(
       'name' => $alias,
       'type_id' => $syntype->cvterm_id,
    );
    $options = array('statement_name' => 'sel_synonym_name_typeid');
    $result = tripal_core_chado_select('synonym', array('*'), $syn_select, $options);
    if (count($result) == 0) {
      $values = array(
         'name' => $alias,
         'type_id' => $syntype->cvterm_id,
         'synonym_sgml' => '',
      );
      $options = array('statement_name' => 'ins_synonym_name_typeid_synonymsgml');
      $success = tripal_core_chado_insert('synonym', $values, $options);
      if (!$success) {
        watchdog("T_gff3_loader", "Cannot add alias $alias to synonym table", array(), WATCHDOG_WARNING);
        return 0;
      }
      $options = array('statement_name' => 'sel_synonym_name_typeid');
      $result = tripal_core_chado_select('synonym', array('*'), $syn_select, $options);
      if (empty($result)) {
        // the insert reported success but the record cannot be read back
        watchdog("T_gff3_loader", "Cannot add alias $alias to synonym table", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
    $synonym = $result[0];

    // check to see if we have a NULL publication in the pub table.  If not,
    // then add one.
    // @coder-ignore: non-drupal schema thus table prefixing does not apply
    if (!$pub) {
      $pub_select = array('uniquename' => 'null');
      $options = array('statement_name' => 'sel_pub_uniquename');
      $result = tripal_core_chado_select('pub', array('*'), $pub_select, $options);
      if (count($result) == 0) {
        // prepare the statement.  The statement keeps its two parameters
        // (term name, database name); the unique name of the publication is
        // always 'null' and is therefore written directly into the SQL.
        if (!tripal_core_is_sql_prepared('ins_pub_uniquename_typeid')) {
          $psql = "PREPARE ins_pub_uniquename_typeid (text, text) AS
                   INSERT INTO pub (uniquename, type_id) VALUES ('null',
                   (SELECT cvterm_id
                    FROM cvterm CVT
                      INNER JOIN dbxref DBX on DBX.dbxref_id = CVT.dbxref_id
                      INNER JOIN db DB on DB.db_id = DBX.db_id
                    WHERE CVT.name = $1 and DB.name = $2))";
          $status = tripal_core_chado_prepare('ins_pub_uniquename_typeid', $psql, array('text', 'text'));
          if (!$status) {
            watchdog("T_gff3_loader", "Cannot prepare statement 'ins_pub_uniquename_typeid", array(), WATCHDOG_WARNING);
            return 0;
          }
        }
        // insert the null pub.  An INSERT returns no rows, so success is
        // judged by the query result itself rather than by fetching a row.
        $inserted = chado_query("EXECUTE ins_pub_uniquename_typeid ('%s', '%s')", 'null', 'null');
        if (!$inserted) {
          watchdog("T_gff3_loader", "Cannot add null publication needed for setup of alias", array(), WATCHDOG_WARNING);
          return 0;
        }
        $options = array('statement_name' => 'sel_pub_uniquename');
        $result = tripal_core_chado_select('pub', array('*'), $pub_select, $options);
        if (empty($result)) {
          watchdog("T_gff3_loader", "Cannot add null publication needed for setup of alias", array(), WATCHDOG_WARNING);
          return 0;
        }
      }
      $pub = $result[0];
    }

    // check to see if the synonym exists in the feature_synonym table
    // if not, then add it.
    $values = array(
       'synonym_id' => $synonym->synonym_id,
       'feature_id' => $feature->feature_id,
       'pub_id' => $pub->pub_id,
    );
    $columns = array('feature_synonym_id');
    $options = array('statement_name' => 'sel_featuresynonym_syfepu');
    $result = tripal_core_chado_select('feature_synonym', $columns, $values, $options);
    if (count($result) == 0) {
      $ins_options = array('statement_name' => 'ins_featuresynonym_syfepu');
      $success = tripal_core_chado_insert('feature_synonym', $values, $ins_options);

      if (!$success) {
        watchdog("T_gff3_loader", "Cannot add alias $alias to feature synonym table", array(), WATCHDOG_WARNING);
        return 0;
      }
    }
  }
  return 1;
}
01324 
/**
 * Inserts a feature, or updates it when it already exists, and records its
 * link to the analysis.
 *
 * A feature is identified by organism, unique name and type.  When it exists
 * and $add_only is set nothing is changed and 0 is returned so that callers
 * skip their follow-up work.  The analysisfeature record is inserted when
 * missing; when present its significance is updated unless $add_only is set.
 *
 * @param $organism
 *   Organism record object; its organism_id is read.
 * @param $analysis_id
 *   Id of the analysis the feature belongs to.
 * @param $cvterm
 *   Type term of the feature; its cvterm_id and name are read.
 * @param $uniquename
 *   Unique name of the feature.
 * @param $name
 *   Name of the feature.
 * @param $residues
 *   Sequence residues.  Only their md5 checksum is written by this function.
 * @param $is_analysis
 *   Flag; 't', 'true', 1, '1' or TRUE mean true, anything else means false.
 * @param $is_obsolete
 *   Flag, interpreted like $is_analysis.
 * @param $add_only
 *   When true an existing feature is left untouched.
 * @param $score
 *   Value of the GFF score column; '.' means that no score is available.
 *
 * @return
 *   The feature record object, or 0 when the feature could not be inserted,
 *   updated or read back, or when it exists and $add_only is set.
 */
function tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm, $uniquename,
  $name, $residues, $is_analysis = 'f', $is_obsolete = 'f', $add_only = 0, $score = '.') {

  // check to see if the feature already exists
  $feature = NULL;
  $fselect = array(
     'organism_id' => $organism->organism_id,
     'uniquename' => $uniquename,
     'type_id' => $cvterm->cvterm_id
  );
  $options = array('statement_name' => 'sel_feature_orunty');
  $columns = array('feature_id', 'name', 'uniquename', 'seqlen', 'organism_id', 'type_id');
  $result = tripal_core_chado_select('feature', $columns, $fselect, $options);
  if (count($result) > 0) {
    $feature = $result[0];
  }

  // normalize the flags to 'TRUE' / 'FALSE'.  The values are compared as
  // strings because a loose comparison of 't' with the integer 0 is TRUE on
  // PHP versions before 8, which would turn every 't' into 'FALSE'.
  $true_values = array('t', 'true', '1');
  $is_obsolete = in_array(strtolower(trim((string) $is_obsolete)), $true_values, TRUE) ? 'TRUE' : 'FALSE';
  $is_analysis = in_array(strtolower(trim((string) $is_analysis)), $true_values, TRUE) ? 'TRUE' : 'FALSE';

  // insert the feature if it does not exist otherwise perform an update.
  // Neither branch writes 'residues' or 'seqlen'; only the checksum is stored.
  if (!$feature) {
    $values = array(
       'organism_id' => $organism->organism_id,
       'name' => $name,
       'uniquename' => $uniquename,
       'md5checksum' => md5($residues),
       'type_id' => $cvterm->cvterm_id,
       'is_analysis' => $is_analysis,
       'is_obsolete' => $is_obsolete,
    );
    $options = array('statement_name' => 'ins_feature_all');
    $result = tripal_core_chado_insert('feature', $values, $options);
    if (!$result) {
      watchdog("T_gff3_loader", "Failed to insert feature '$uniquename' ($cvterm->name)", array(), WATCHDOG_WARNING);
      return 0;
    }
  }
  elseif (!$add_only) {
    $values = array(
      'name' => $name,
      'md5checksum' => md5($residues),
      'is_analysis' => $is_analysis,
      'is_obsolete' => $is_obsolete,
    );
    $match = array(
      'organism_id' => $organism->organism_id,
      'uniquename' => $uniquename,
      'type_id' => $cvterm->cvterm_id,
    );
    $options = array('statement_name' => 'upd_feature');
    $result = tripal_core_chado_update('feature', $match, $values, $options);
    if (!$result) {
      watchdog("T_gff3_loader", "Failed to update feature '$uniquename' ($cvterm->name)", array(), WATCHDOG_WARNING);
      return 0;
    }
  }
  else {
    // the feature exists and we don't want to update it so return
    // a value of 0.  This will stop all downstream property additions
    return 0;
  }

  // get the newly added (or updated) feature
  $options = array('statement_name' => 'sel_feature_orunty');
  $columns = array('feature_id', 'name', 'uniquename', 'seqlen', 'organism_id', 'type_id');
  $result = tripal_core_chado_select('feature', $columns, $fselect, $options);
  if (empty($result)) {
    // nothing to attach the analysis to, and nothing useful to return
    watchdog("T_gff3_loader", "Failed to retrieve feature '%uniquename' (%type) after saving it",
      array('%uniquename' => $uniquename, '%type' => $cvterm->name), WATCHDOG_WARNING);
    return 0;
  }
  $feature = $result[0];

  // add the analysisfeature entry to the analysisfeature table if it doesn't already exist
  $af_values = array(
    'analysis_id' => $analysis_id,
    'feature_id' => $feature->feature_id
  );
  $options = array('statement_name' => 'sel_analysisfeature_analysisid_featureid');
  $afeature = tripal_core_chado_select('analysisfeature', array('analysisfeature_id'), $af_values, $options);
  $has_score = (strcmp($score, '.') != 0);
  if (count($afeature)==0) {
    // if a score is available then set that to be the significance field
    if ($has_score) {
      $af_values['significance'] = $score;
      $options = array('statement_name' => 'ins_analysisfeature_analysisid_featureid_significance');
    }
    else {
      $options = array('statement_name' => 'ins_analysisfeature_analysisid_featureid');
    }
    if (!tripal_core_chado_insert('analysisfeature', $af_values, $options)) {
      watchdog("T_gff3_loader", "Could not add analysisfeature record: $analysis_id, $feature->feature_id", array(), WATCHDOG_WARNING);
    }
  }
  elseif (!$add_only) {
    // if a score is available then set that to be the significance field,
    // otherwise pass the '__NULL__' marker as the new value
    $new_vals = array('significance' => $has_score ? $score : '__NULL__');
    $options = array('statement_name' => 'upd_analysisfeature');
    $ret = tripal_core_chado_update('analysisfeature', $af_values, $new_vals, $options);
    if (!$ret) {
      watchdog("T_gff3_loader", "Could not update analysisfeature record: $analysis_id, $feature->feature_id", array(), WATCHDOG_WARNING);
    }
  }
  return $feature;
}
01452 
/**
 * Adds or updates the featureloc record that locates a feature on a landmark.
 *
 * The landmark (source feature) is looked up by uniquename, then by name. When
 * it still cannot be found and $landmark_is_target is set, a lookup by
 * uniquename alone (any organism, any type) is tried. As a last resort the
 * landmark is created if $create_landmark and $landmark_type_id are both set.
 *
 * An existing featureloc of $feature is updated in place when its source
 * feature has the same name as $landmark and either its fmin or its fmax equals
 * the supplied value; otherwise a new featureloc is inserted with the next
 * free rank.
 *
 * @param $feature
 *   Object whose feature_id identifies the feature being located.
 * @param $organism
 *   Object whose organism_id is used for the landmark lookup when
 *   $landmark_organism_id is empty.
 * @param $landmark
 *   The uniquename (or name) of the landmark feature.
 * @param $fmin
 *   The fmin value to store.
 * @param $fmax
 *   The fmax value to store.
 * @param $strand
 *   The strand value to store.
 * @param $phase
 *   The phase. Only numeric values (including 0) are stored; anything else
 *   (e.g. '', NULL or '.') leaves the phase column unset.
 * @param $is_fmin_partial
 *   Empty values and 'f' are stored as 'FALSE', everything else as 'TRUE'.
 * @param $is_fmax_partial
 *   Empty values and 'f' are stored as 'FALSE', everything else as 'TRUE'.
 * @param $residue_info
 *   Value for the residue_info column.
 * @param $locgroup
 *   Value for the locgroup column.
 * @param $landmark_type_id
 *   Optional cvterm id restricting the landmark lookup; required for creating
 *   a missing landmark.
 * @param $landmark_organism_id
 *   Optional organism id of the landmark; overrides $organism->organism_id.
 * @param $create_landmark
 *   If set, a missing landmark is inserted (requires $landmark_type_id).
 * @param $landmark_is_target
 *   If set, a missing landmark is additionally searched by uniquename alone.
 *
 * @return
 *   1 if the featureloc exists, was updated or was inserted; 0 if the landmark
 *   could not be resolved or the insert failed.
 */
function tripal_feature_load_gff3_featureloc($feature, $organism, $landmark, $fmin,
  $fmax, $strand, $phase, $is_fmin_partial, $is_fmax_partial, $residue_info, $locgroup,
  $landmark_type_id = '', $landmark_organism_id = '', $create_landmark = 0,
  $landmark_is_target = 0) {

  // the organism used for every landmark lookup below
  $landmark_org_id = $landmark_organism_id ? $landmark_organism_id : $organism->organism_id;

  // first try to find the landmark using its uniquename
  $select = array(
    'organism_id' => $landmark_org_id,
    'uniquename' => $landmark,
  );
  $options = array('statement_name' => 'sel_feature_orun');
  if ($landmark_type_id) {
    $select['type_id'] = $landmark_type_id;
    $options = array('statement_name' => 'sel_feature_orunty');
  }
  $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options);

  $srcfeature = '';
  if (count($results)==0) {
    // so we couldn't find the landmark using the uniquename. Let's try the 'name'.
    // if we return only a single result then we can proceed. Otherwise give an
    // error because we cannot tell which landmark is meant.
    $select = array(
      'organism_id' => $landmark_org_id,
      'name' => $landmark,
    );
    $options = array('statement_name' => 'sel_feature_orna');
    if ($landmark_type_id) {
      $select['type_id'] = $landmark_type_id;
      $options = array('statement_name' => 'sel_feature_ornaty');
    }
    $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options);
    if (count($results) == 0) {
       // if the landmark is the target feature in a matched alignment then try one more time to
       // find it by querying any feature with the same uniquename. If we find one then use it.
       if ($landmark_is_target) {
         $select = array('uniquename' => $landmark);
         $options = array('statement_name' => 'sel_feature_un');
         $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options);
         if (count($results) == 1) {
           $srcfeature = $results[0];
         }
       }

       if (!$srcfeature) {
         // we couldn't find the landmark feature, so if the user has requested we create it then do so
         // but only if we have a type id
         if ($create_landmark and $landmark_type_id) {
            $values = array(
              'organism_id' => $landmark_org_id,
              'name' => $landmark,
              'uniquename' => $landmark,
              'type_id' => $landmark_type_id
            );
            $options = array('statement_name' => 'ins_feature_ornaunty');
            $results = tripal_core_chado_insert('feature', $values, $options);
            if (!$results) {
              watchdog("T_gff3_loader", "Cannot find landmark feature: '%landmark', nor could it be inserted",
                array('%landmark' => $landmark), WATCHDOG_WARNING);
              return 0;
            }
            $srcfeature = new stdClass();
            $srcfeature->feature_id = $results['feature_id'];
         }
         else {
           watchdog("T_gff3_loader", "Cannot find unique landmark feature: '%landmark'.",
             array('%landmark' => $landmark), WATCHDOG_WARNING);
           return 0;
         }
       }
    }
    elseif (count($results) > 1) {
       watchdog("T_gff3_loader", "multiple landmarks exist with the name: '%landmark'.  Cannot 
         resolve which one to use. Cannot add the feature location record", 
         array('%landmark' => $landmark), WATCHDOG_WARNING);
       return 0;
    }
    else {
      $srcfeature = $results[0];
    }
  }
  elseif (count($results) > 1) {
    watchdog("T_gff3_loader", "multiple landmarks exist with the name: '%landmark'.  Cannot 
      resolve which one to use. Cannot add the feature location record", 
      array('%landmark' => $landmark), WATCHDOG_WARNING);
    return 0;
  }
  else {
    $srcfeature = $results[0];
  }

  // check to see if this featureloc already exists, but also keep track of the
  // last rank value
  $rank = 0;
  $exists = 0;
  $select = array('feature_id' => $feature->feature_id);
  $options = array(
    'statement_name' => 'sel_featureloc_fe',
    'order_by' => array(
       'rank' => 'ASC'
    ),
  );
  $locrecs = tripal_core_chado_select('featureloc', array('*'), $select, $options);
  if (!is_array($locrecs)) {
    // a failed select must not break the loop below
    $locrecs = array();
  }

  foreach ($locrecs as $featureloc) {
    // every existing record occupies a rank, including the ones skipped
    // below, so the next free rank must be tracked before any 'continue'.
    $rank = $featureloc->rank + 1;

    // it is possible for the featureloc->srcfeature_id to be NULL. This can happen if the srcfeature
    // is not known (according to chado table field descriptions).  If it's null then just skip this entry
    if (!$featureloc->srcfeature_id) {
      continue;
    }
    $select = array('feature_id' => $featureloc->srcfeature_id);
    $options = array('statement_name' => 'sel_feature_fe');
    $columns = array('feature_id', 'name');
    $locsfeature = tripal_core_chado_select('feature', $columns, $select, $options);
    if (empty($locsfeature)) {
      // the source feature could not be loaded, so it cannot be compared
      continue;
    }

    // the source feature name must be the same and at least one of fmin or
    // fmax must match for an update of the featureloc, otherwise we'll
    // insert a new record.
    if (strcmp($locsfeature[0]->name, $landmark)==0 and
       ($featureloc->fmin == $fmin or $featureloc->fmax == $fmax)) {
      $match = array('featureloc_id' => $featureloc->featureloc_id);
      $values = array();
      $exists = 1;
      if ($featureloc->fmin != $fmin) {
         $values['fmin'] = $fmin;
      }
      if ($featureloc->fmax != $fmax) {
         $values['fmax'] = $fmax;
      }
      if ($featureloc->strand != $strand) {
         $values['strand'] = $strand;
      }
      if (count($values) > 0) {
        $options = array('statement_name' => 'upd_featureloc_all');
        tripal_core_chado_update('featureloc', $match, $values, $options);
      }
    }
  }

  if (!$exists) {
    // this feature location is new so add it. Empty values and 'f' become
    // 'FALSE', any other value becomes 'TRUE'.
    $is_fmin_partial = (!$is_fmin_partial or strcmp($is_fmin_partial, 'f') == 0) ? 'FALSE' : 'TRUE';
    $is_fmax_partial = (!$is_fmax_partial or strcmp($is_fmax_partial, 'f') == 0) ? 'FALSE' : 'TRUE';

    $values = array(
       'feature_id'      => $feature->feature_id,
       'srcfeature_id'   => $srcfeature->feature_id,
       'fmin'            => $fmin,
       'is_fmin_partial' => $is_fmin_partial,
       'fmax'            => $fmax,
       'is_fmax_partial' => $is_fmax_partial,
       'strand'          => $strand,
       'residue_info'    => $residue_info,
       'locgroup'        => $locgroup,
       'rank'            => $rank
    );
    $options = array('statement_name' => 'ins_featureloc_all');
    // a phase of 0 is a real value, so test for a numeric value rather than
    // for truthiness.
    if (is_numeric($phase)) {
      $values['phase'] = $phase;
      $options = array('statement_name' => 'ins_featureloc_allphase');
    }
    $success = tripal_core_chado_insert('featureloc', $values, $options);
    if (!$success) {
      // report the failure to the caller instead of terminating the process
      watchdog("T_gff3_loader", "Failed to insert featureloc", array(), WATCHDOG_WARNING);
      return 0;
    }
  }
  return 1;
}
/**
 * Adds a property (featureprop record) to a feature.
 *
 * The property name is looked up as a cvterm in the 'feature_property'
 * vocabulary and is added there if it does not exist yet. The value is only
 * added if the feature does not already have the same value for this
 * property; additional values get the next rank.
 *
 * @param $feature
 *   Object whose feature_id identifies the feature receiving the property.
 * @param $property
 *   The name of the property (the cvterm name).
 * @param $value
 *   The value of the property.
 *
 * @return
 *   0 if the cvterm for the property could not be added; nothing otherwise.
 */
function tripal_feature_load_gff3_property($feature, $property, $value) {

  // first make sure the cvterm exists.  if not, then add it
  $select = array(
     'name' => $property,
     'cv_id' => array(
        'name' => 'feature_property',
     ),
  );
  $options = array('statement_name' => 'sel_cvterm_name_cvid');
  $result = tripal_core_chado_select('cvterm', array('*'), $select, $options);

  if (count($result) == 0) {
    $term = array(
      'id' => "null:$property",
      'name' => $property,
      'namespace' => 'feature_property',
      'is_obsolete' => 0,
    );
    // test the raw return value: once cast to an object it is always truthy
    // and a failure could no longer be detected.
    $new_term = tripal_cv_add_cvterm($term, 'feature_property', 0, 0);
    if (!$new_term) {
      watchdog("T_gff3_loader", "Cannot add cvterm, $property", array(), WATCHDOG_WARNING);
      return 0;
    }
    $cvterm = (object) $new_term;
  }
  else {
    $cvterm = $result[0];
  }

  // check to see if the property already exists for this feature
  // if it does but the value is unique then increment the rank and add it.
  // if the value is not unique then don't add it.
  $rank = 0;
  $select = array(
     'feature_id' => $feature->feature_id,
     'type_id' => $cvterm->cvterm_id,
  );
  $options = array(
    'statement_name' => 'sel_featureprop_featureid_typeid',
    'order_by' => array(
      'rank' => 'ASC',
    ),
  );
  $results = tripal_core_chado_select('featureprop', array('*'), $select, $options);
  if (!is_array($results)) {
    // a failed select must not break the loop below
    $results = array();
  }
  foreach ($results as $prop) {
    if (strcmp($prop->value, $value)==0) {
      // the value already exists for this feature, nothing left to do
      return;
    }
    $rank = $prop->rank + 1;
  }

  // the value is new for this feature so add it with the next rank
  $values = array(
     'feature_id' => $feature->feature_id,
     'type_id' => $cvterm->cvterm_id,
     'value' => $value,
     'rank' => $rank,
  );
  $options = array('statement_name' => 'ins_featureprop_all');
  $result = tripal_core_chado_insert('featureprop', $values, $options);
  if (!$result) {
    watchdog("T_gff3_loader", "cannot add featureprop, $property", array(), WATCHDOG_WARNING);
  }
}
01714 
/**
 * Reads FASTA records from an open file handle and stores the residues.
 *
 * Reading starts at the current position of $fh and continues to the end of
 * the file. The text following '>' on a definition line is used to look up
 * the feature_id in the tripal_gff_temp table; the sequence lines that
 * follow are concatenated and saved as the residues of that feature.
 *
 * @param $fh
 *   An open file handle.
 * @param $interval
 *   Amount of input (as measured by drupal_strlen()) to read between two
 *   progress updates.
 * @param $num_read
 *   Running total of input read; updated by reference.
 * @param $intv_read
 *   Input read since the last progress update; updated by reference.
 * @param $line_num
 *   Current line number; updated by reference.
 * @param $job
 *   Optional job identifier handed to tripal_job_set_progress(). Progress is
 *   only reported when this is set.
 * @param $filesize
 *   Optional total size of the file, used to compute the percentage. Progress
 *   is only reported when this is greater than zero.
 *
 * @return
 *   An empty string if the lookup statement could not be prepared; nothing
 *   otherwise.
 */
function tripal_feature_load_gff3_fasta($fh, $interval, &$num_read, &$intv_read, &$line_num,
  $job = NULL, $filesize = 0) {
  print "Loading FASTA sequences\n";
  $residues = '';
  $sql = " 
    PREPARE sel_gfftemp_un (text) AS
    SELECT feature_id FROM tripal_gff_temp
    WHERE uniquename = $1
  ";
  $status = tripal_core_chado_prepare('sel_gfftemp_un', $sql, array('text'));
  if (!$status) {
   watchdog('T_gff3_loader', 'Cannot prepare statement \'sel_gfftemp_un\'.',
     array(), WATCHDOG_ERROR);
   return '';
  }
  $id = NULL;

  // iterate through the remaining lines of the file. Compare against FALSE
  // so that a line such as "0" does not end the loop early.
  while (($line = fgets($fh)) !== FALSE) {

    $line_num++;
    $size = drupal_strlen($line);
    $num_read += $size;
    $intv_read += $size;

    $line = trim($line);

    // update the job status whenever another interval has been read
    if ($job and $filesize > 0 and $intv_read >= $interval) {
      $intv_read = 0;
      $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
      print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
      tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
    }

    if (preg_match('/^>/', $line)) {
      // a definition line starts a new sequence, so save the one we just
      // finished (if any) before starting over.
      if ($id !== NULL and $id !== '') {
        _tripal_feature_load_gff3_fasta_save_residues($id, $residues);
      }
      $id = preg_replace('/^>(.*)$/', '\1', $line);
      $residues = '';
    }
    else {
      $residues .= $line;
    }
  }

  // add in the last sequence, but only if a definition line was ever read
  if ($id !== NULL and $id !== '') {
    _tripal_feature_load_gff3_fasta_save_residues($id, $residues);
  }
}

/**
 * Saves the residues of one FASTA record to its feature.
 *
 * Uses the prepared statement 'sel_gfftemp_un' to find the feature_id for
 * $id and logs a warning if no feature is found.
 *
 * @param $id
 *   The identifier taken from the FASTA definition line.
 * @param $residues
 *   The concatenated sequence lines of the record.
 */
function _tripal_feature_load_gff3_fasta_save_residues($id, $residues) {
  $sql = "EXECUTE sel_gfftemp_un('%s')";
  $result = tripal_core_chado_execute_prepared('sel_gfftemp_un', $sql, array($id));

  // the query may succeed yet return no row, so check the fetched record too
  $feature = $result ? db_fetch_object($result) : FALSE;
  if (!$feature) {
    watchdog('T_gff3_loader', 'Cannot find feature to assign FASTA sequence: %uname',
       array('%uname' => $id), WATCHDOG_WARNING);
    return;
  }

  // we have a feature so add the residues
  $values = array('residues' => $residues);
  $match = array('feature_id' => $feature->feature_id);
  $options = array('statement_name' => 'upd_feature_re');
  tripal_core_chado_update('feature', $match, $values, $options);
}
01798 
/**
 * Loads the alignment described by the 'Target' attribute of a GFF3 feature.
 *
 * The Target value has the format "target_id start end [strand]". A
 * featureloc record is added that locates $feature on the target, i.e. the
 * target is used as the source feature (landmark).
 *
 * @param $feature
 *   Object whose feature_id identifies the feature carrying the attribute.
 * @param $tags
 *   The attributes of the GFF3 line, keyed by tag name; each entry is an array
 *   of values. 'Target' is read, and 'target_organism' (genus:species) and
 *   'target_type' are used when present.
 * @param $target_organism_id
 *   Default organism id of the target, used unless the line carries a
 *   target_organism attribute.
 * @param $target_type
 *   Default type name of the target (a term of the 'sequence' vocabulary);
 *   a target_type attribute on the line takes precedence.
 * @param $create_target
 *   Passed on as the create-landmark flag: a missing target feature is
 *   created if a target type is known.
 * @param $attr_locgroup
 *   The locgroup to store with the featureloc record.
 */
function tripal_feature_load_gff3_target($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup) {
  // format is: "target_id start end [strand]", where strand is optional and may be "+" or "-"
  $matched = preg_match('/^(.*?)\s+(\d+)\s+(\d+)(?:\s+([+-]))?$/', trim($tags['Target'][0]), $matches);

  // the target attribute is not correctly formatted
  if (!$matched) {
    watchdog('T_gff3_loader', "Could not add 'Target' alignment as it is improperly formatted:  '%target'",
      array('%target' => $tags['Target'][0]), WATCHDOG_ERROR);
    return;
  }

  // the organism and type of the target may also be specified as an attribute. If so, then get that
  // information
  $gff_target_organism = array_key_exists('target_organism', $tags) ? $tags['target_organism'][0] : '';
  $gff_target_type = array_key_exists('target_type', $tags) ? $tags['target_type'][0] : '';

  $target_feature = $matches[1];
  $start = $matches[2];
  $end = $matches[3];

  // convert the optional strand to a numeric value. The group is absent
  // from $matches when no strand was given.
  $target_strand = 0;
  if (isset($matches[4])) {
    $target_strand = ($matches[4] == '+') ? 1 : -1;
  }

  // convert the coordinates to fmin/fmax, swapping them if end precedes start
  $target_fmin = $start - 1;
  $target_fmax = $end;
  if ($end < $start) {
    $target_fmin = $end - 1;
    $target_fmax = $start;
  }

  // default the target organism to be the value passed into the function, but if the GFF
  // file specifies the target organism then use that instead.
  $t_organism_id = $target_organism_id;
  if ($gff_target_organism) {
    // get the genus and species
    $success = preg_match('/^(.*?):(.*?)$/', $gff_target_organism, $org_parts);
    if ($success) {
      $values = array(
        'genus' => $org_parts[1],
        'species' => $org_parts[2],
      );
      $options = array('statement_name' => 'sel_organism_gesp');
      $torganism = tripal_core_chado_select('organism', array('organism_id'), $values, $options);
      if (count($torganism) == 1) {
        $t_organism_id = $torganism[0]->organism_id;
      }
      else {
        watchdog('T_gff3_loader', "Cannot find organism for target %target.",
          array('%target' => $gff_target_organism), WATCHDOG_WARNING);
        $t_organism_id = '';
      }
    }
    else {
      watchdog('T_gff3_loader', "The target_organism attribute is improperly formatted: %target. 
          It should be target_organism=genus:species.", 
        array('%target' => $gff_target_organism), WATCHDOG_WARNING);
      $t_organism_id = '';
    }
  }

  // default the target type to be the value passed into the function, but if the GFF file
  // specifies the target type then use that instead
  $t_type_id = '';
  if ($target_type) {
    $values = array(
      'name' => $target_type,
      'cv_id' => array(
         'name' => 'sequence',
      )
    );
    $options = array('statement_name' => 'sel_cvterm_nacv');
    $type = tripal_core_chado_select('cvterm', array('cvterm_id'), $values, $options);
    if (count($type) == 1) {
      $t_type_id = $type[0]->cvterm_id;
    }
    else {
      watchdog('T_gff3_loader', "The target type does not exist in the sequence ontology: %type. ",
        array('%type' => $target_type), WATCHDOG_ERROR);
      // NOTE(review): this terminates the whole PHP process, not just this
      // function. Kept as is because it appears deliberate - confirm.
      exit;
    }
  }
  if ($gff_target_type) {
    $values = array(
      'name' => $gff_target_type,
      'cv_id' => array(
         'name' => 'sequence',
      )
    );
    $options = array('statement_name' => 'sel_cvterm_nacv');
    $type = tripal_core_chado_select('cvterm', array('cvterm_id'), $values, $options);
    if (count($type) == 1) {
      $t_type_id = $type[0]->cvterm_id;
    }
    else {
      watchdog('T_gff3_loader', "The target_type attribute does not exist in the sequence ontology: %type. ",
        array('%type' => $gff_target_type), WATCHDOG_WARNING);
      $t_type_id = '';
    }
  }

  // this function receives neither the organism being loaded nor a phase,
  // partial flags or residue info, so explicit empty placeholders are passed.
  // The placeholder organism is only consulted when $t_organism_id is empty.
  $organism = new stdClass();
  $organism->organism_id = NULL;
  $phase = NULL;
  $fmin_partial = NULL;
  $fmax_partial = NULL;
  $residue_info = NULL;

  // we want to add a featureloc record that uses the target feature as the srcfeature (landmark)
  // and the landmark as the feature.
  tripal_feature_load_gff3_featureloc($feature, $organism, $target_feature, $target_fmin,
    $target_fmax, $target_strand, $phase, $fmin_partial, $fmax_partial, $residue_info,
    $attr_locgroup, $t_type_id, $t_organism_id, $create_target, TRUE);
}
 All Classes Files Functions Variables