Tripal v1.0 (6.x-1.0)
fasta_loader.inc
Go to the documentation of this file.
00001 <?php
00002 
/**
 * Builds the form used to request the import of a FASTA file into Chado.
 *
 * The form collects the file path, the organism, the Sequence Ontology type
 * of the sequences, the insert/update method, the field used to match
 * existing features, the analysis the features derive from, and a set of
 * optional "advanced" settings (regular expressions for the name, unique
 * name and accession, an external database, and a relationship to already
 * loaded parent features).
 *
 * The select lists for organisms, analyses and external databases are filled
 * from the Chado {organism}, {analysis} and {db} tables.
 *
 * @return
 *   A Drupal Form API array.
 */
function tripal_feature_fasta_load_form() {

  $form['fasta_file'] = array(
    '#type'          => 'textfield',
    '#title'         => t('FASTA File'),
    '#description'   => t('Please enter the full system path for the FASTA file, or a path within the Drupal
                           installation (e.g. /sites/default/files/xyz.fasta).  The path must be accessible to the
                           server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // get the list of organisms
  $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  $org_rset = chado_query($sql);
  $organisms = array();
  $organisms[''] = '';
  while ($organism = db_fetch_object($org_rset)) {
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }
  $form['organism_id'] = array(
    '#title'       => t('Organism'),
    // Form API type keys are machine names and must never be translated.
    '#type'        => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required'    => TRUE,
    '#options'     => $organisms,
  );

  $form['seqtype'] = array(
    '#type' => 'textfield',
    '#title' => t('Sequence Type'),
    '#required' => TRUE,
    '#description' => t('Please enter the Sequence Ontology (SO) term name that describes the sequences in the FASTA file (e.g. gene, mRNA, protein, etc...)'),
  );

  // NOTE: a 'library_id' select element used to be sketched out here but is
  // disabled, so this form does not submit a library. The query that built
  // its option list was removed because nothing consumed the result.

  $form['method'] = array(
    '#type' => 'radios',
    '#title' => t('Method'),
    '#required' => TRUE,
    // The numeric keys (0, 1, 2) are what gets submitted; the validate and
    // submit handlers translate them back to these labels.
    '#options' => array(
      t('Insert only'),
      t('Update only'),
      t('Insert and update'),
    ),
    '#description' => t('Select how features in the FASTA file are handled.
       Select "Insert only" to insert the new features. If a feature already
       exists with the same name or unique name and type then it is skipped.
       Select "Update only" to only update features that already exist in the
       database.  Select "Insert and Update" to insert features that do
       not exist and update those that do.'),
    '#default_value' => 2,
  );

  $form['match_type'] = array(
    '#type' => 'radios',
    '#title' => t('Name Match Type'),
    '#required' => TRUE,
    // Submitted as 0 (Name) or 1 (Unique name).
    '#options' => array(
      t('Name'),
      t('Unique name'),
    ),
    '#description' => t('Used for "updates only" or "insert and update" methods. Not required if method type is "insert".  
      Feature data is stored in Chado with both a human-readable
      name and a unique name. If the features in your FASTA file are uniquely identified using
      a human-readable name then select the "Name" button. If your features are
      uniquely identified using the unique name then select the "Unique name" button.  If you
      loaded your features first using the GFF loader then the unique name of each
      feature was indicated by the "ID=" attribute and the name by the "Name=" attribute.
      By default, the FASTA loader will use the first word (character string
      before the first space) as  the name for your feature. If
      this does not uniquely identify your feature consider specifying a regular expression in the advanced section below.
      Additionally, you may import both a name and a unique name for each sequence using the advanced options.'),
    '#default_value' => 1,
  );

  $form['analysis'] = array(
    '#type' => 'fieldset',
    '#title' => t('Analysis Used to Derive Features'),
    '#collapsed' => TRUE
  );
  $form['analysis']['desc'] = array(
    '#type' => 'markup',
    '#value' => t("Why specify an analysis for a data load?  All data comes
       from some place, even if downloaded from Genbank. By specifying
       analysis details for all data uploads, it allows an end user to reproduce the
       data set, or at least indicates the source of the data."),
  );

  // get the list of analyses
  $sql = "SELECT * FROM {analysis} ORDER BY name";
  $analysis_rset = chado_query($sql);
  $analyses = array();
  $analyses[''] = '';
  while ($analysis = db_fetch_object($analysis_rset)) {
    $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  }
  $form['analysis']['analysis_id'] = array(
    '#title'       => t('Analysis'),
    '#type'        => 'select',
    '#description' => t("Choose the analysis to which these features are associated"),
    '#required'    => TRUE,
    '#options'     => $analyses,
  );

  // Advanced Options
  $form['advanced'] = array(
    '#type' => 'fieldset',
    '#title' => t('Advanced Options'),
    '#collapsible' => TRUE,
    '#collapsed' => TRUE
  );
  $form['advanced']['re_help'] = array(
    '#type' => 'item',
    '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
                   Your FASTA file may contain both a human-readable name and a unique name for each sequence.
                   If you want to import
                   both the name and unique name for all sequences, then you must provide regular expressions
                   so that the loader knows how to separate them.
                   Otherwise the name and uniquename will be the same.
                   By default, this loader will use the first word in the definition
                   lines of the FASTA file
                   as the name or unique name of the feature.'),
  );
  $form['advanced']['re_name'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the
       feature name from the FASTA definition line. For example, for a
       definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
       the regular expression for the name would be, "^(.*?)\|.*$".'),
  );
  $form['advanced']['re_uname'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the unique name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the
       feature unique name from the FASTA definition line. For example, for a
       definition line with a name and unique name separated by a bar \'|\' (>seqname|uniquename),
       the regular expression for the unique name would be "^.*?\|(.*)$".'),
  );


  // Advanced database cross-reference options
  $form['advanced']['db'] = array(
    '#type' => 'fieldset',
    '#title' => t('External Database Reference'),
    '#weight' => 6,
    '#collapsed' => TRUE
  );
  $form['advanced']['db']['re_accession'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the accession'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
    '#weight' => 2
  );

  // get the list of databases
  $sql = "SELECT * FROM {db} ORDER BY name";
  $db_rset = chado_query($sql);
  $dbs = array();
  $dbs[''] = '';
  while ($db = db_fetch_object($db_rset)) {
    $dbs[$db->db_id] = "$db->name";
  }
  $form['advanced']['db']['db_id'] = array(
    '#title'       => t('External Database'),
    '#type'        => 'select',
    '#description' => t("Please choose an external database for which these sequences have a cross reference."),
    '#required'    => FALSE,
    '#options'     => $dbs,
    '#weight'      => 1,
  );

  // Advanced relationship options
  $form['advanced']['relationship'] = array(
    '#type' => 'fieldset',
    '#title' => t('Relationships'),
    '#weight' => 6,
    '#collapsed' => TRUE
  );
  // Keys are the relationship term names submitted with the form; values are
  // the labels shown to the user.
  $rels = array();
  $rels[''] = '';
  $rels['part_of'] = 'part of';
  $rels['derives_from'] = 'produced by';

  $form['advanced']['relationship']['rel_type'] = array(
    '#title'       => t('Relationship Type'),
    '#type'        => 'select',
    '#description' => t("Use this option to create associations, or relationships between the
                        features of this FASTA file and existing features in the database. For
                        example, to associate a FASTA file of peptides to existing genes or transcript sequence,
                        select the type 'produced by'. For a CDS sequences select the type 'part of'"),
    '#required'    => FALSE,
    '#options'     => $rels,
    '#weight'      => 5,
  );
  $form['advanced']['relationship']['re_subject'] = array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the parent'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the unique
                         name needed to identify the existing sequence for which the
                         relationship type selected above will apply.'),
    '#weight' => 6
  );
  $form['advanced']['relationship']['parent_type'] = array(
    '#type' => 'textfield',
    '#title' => t('Parent Type'),
    '#required' => FALSE,
    '#description' => t('Please enter the Sequence Ontology term for the parent.  For example
                         if the FASTA file being loaded is a set of proteins that are
                         products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
                         this type must match the type for already loaded features.'),
    '#weight' => 7
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import FASTA file'),
    '#weight' => 10,
  );
  return $form;
}
00268 
/**
 * Validation handler for tripal_feature_fasta_load_form().
 *
 * Checks that:
 *  - a regular expression is supplied for the field selected as match type
 *    when only the other regular expression was given,
 *  - the FASTA file exists, either relative to the Drupal installation or as
 *    the literal path that was entered,
 *  - the relationship fields and the external database fields are either all
 *    filled in or all left empty,
 *  - the sequence type (and the parent type, when a relationship is
 *    requested) resolves to a term or synonym in the 'sequence' vocabulary.
 *
 * On completion the resolved file path is stored in
 * $form_state['storage']['dfile'] for the submit handler.
 *
 * @param $form
 *   The form array.
 * @param $form_state
 *   The form state, passed by reference.
 */
function tripal_feature_fasta_load_form_validate($form, &$form_state) {
  $values       = $form_state['values'];
  $fasta_file   = trim($values['fasta_file']);
  $type         = trim($values['seqtype']);
  $match_type   = trim($values['match_type']);
  $re_name      = trim($values['re_name']);
  $re_uname     = trim($values['re_uname']);
  $re_accession = trim($values['re_accession']);
  $db_id        = $values['db_id'];
  $rel_type     = $values['rel_type'];
  $re_subject   = trim($values['re_subject']);
  $parent_type  = trim($values['parent_type']);

  // The radios submit their numeric option key; convert it to the label
  // used by the checks below.
  $match_types = array('Name', 'Unique name');
  if (isset($match_types[$match_type])) {
    $match_type = $match_types[$match_type];
  }

  if ($re_name and !$re_uname and strcmp($match_type, 'Unique name') == 0) {
    form_set_error('re_uname', t("You must provide a regular expression to identify the sequence unique name"));
  }

  if (!$re_name and $re_uname and strcmp($match_type, 'Name') == 0) {
    form_set_error('re_name', t("You must provide a regular expression to identify the sequence name"));
  }

  // check to see if the file is located local to Drupal
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
  if (!file_exists($dfile)) {
    // if not local to Drupal, the file must be someplace else, just use
    // the full path provided
    $dfile = $fasta_file;
  }
  if (!file_exists($dfile)) {
    form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  }

  // make sure if a relationship is specified that all fields are provided.
  if (($rel_type or $parent_type) and !$re_subject) {
    form_set_error('re_subject', t("Please provide a regular expression for the parent"));
  }
  if (($rel_type or $re_subject) and !$parent_type) {
    form_set_error('parent_type', t("Please provide a SO term for the parent"));
  }
  if (($parent_type or $re_subject) and !$rel_type) {
    form_set_error('rel_type', t("Please select a relationship type"));
  }

  // make sure if a database is specified that all fields are provided
  if ($db_id and !$re_accession) {
    form_set_error('re_accession', t("Please provide a regular expression for the accession"));
  }
  if ($re_accession and !$db_id) {
    form_set_error('db_id', t("Please select a database"));
  }

  // check to make sure the types exist, either as a term name or a synonym
  $cvtermsql = "SELECT CVT.cvterm_id
               FROM {cvterm} CVT
                  INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                  LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
               WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
  if (!$cvterm) {
    // The form element is named 'seqtype'; the error must use that name to
    // be attached to the field.
    form_set_error('seqtype', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
  }
  // A missing parent type was already reported above, so only look the term
  // up when one was actually entered.
  if ($rel_type and $parent_type) {
    $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
    if (!$cvterm) {
      form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
    }
  }

  // remember the resolved path so the submit handler does not have to
  // repeat the lookup
  $form_state['storage']['dfile'] = $dfile;
}
00368 
/**
 * Submit handler for tripal_feature_fasta_load_form().
 *
 * Collects the submitted values, converts the numeric radio selections for
 * 'method' and 'match_type' into their textual labels, and queues a Tripal
 * job that calls tripal_feature_load_fasta() with those arguments.
 *
 * @param $form
 *   The form array.
 * @param $form_state
 *   The form state, passed by reference. The file path is read from
 *   $form_state['storage']['dfile'], which the validate handler sets.
 */
function tripal_feature_fasta_load_form_submit($form, &$form_state) {
  global $user;

  $values       = $form_state['values'];
  $dfile        = $form_state['storage']['dfile'];
  $organism_id  = $values['organism_id'];
  $type         = trim($values['seqtype']);
  $method       = trim($values['method']);
  $match_type   = trim($values['match_type']);
  // The form currently has no 'library_id' element, so guard the lookup to
  // avoid an undefined-index notice; NULL keeps the job argument position.
  $library_id   = isset($values['library_id']) ? $values['library_id'] : NULL;
  $re_name      = trim($values['re_name']);
  $re_uname     = trim($values['re_uname']);
  $re_accession = trim($values['re_accession']);
  $db_id        = $values['db_id'];
  $rel_type     = $values['rel_type'];
  $re_subject   = trim($values['re_subject']);
  $parent_type  = trim($values['parent_type']);
  $analysis_id  = $values['analysis_id'];

  // The radios submit their numeric option key; the loader expects the
  // corresponding label.
  $methods = array('Insert only', 'Update only', 'Insert and update');
  if (isset($methods[$method])) {
    $method = $methods[$method];
  }
  $match_types = array('Name', 'Unique name');
  if (isset($match_types[$match_type])) {
    $match_type = $match_types[$match_type];
  }

  // Argument order must match the signature of tripal_feature_load_fasta().
  $args = array($dfile, $organism_id, $type, $library_id, $re_name, $re_uname,
          $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $method,
          $user->uid, $analysis_id, $match_type);

  // Only the file name, not the whole path, is shown in the job title.
  $fname = basename($dfile);
  tripal_add_job("Import FASTA file: $fname", 'tripal_feature',
    'tripal_feature_load_fasta', $args, $user->uid);
}
00418 
/**
 * Loads the sequences of a FASTA file into Chado as features.
 *
 * The file is read line by line. Each definition line (starting with '>')
 * is parsed for the feature name, unique name, external accession and
 * relationship subject; the sequence lines that follow are concatenated into
 * the residues. Every completed record is handed to
 * tripal_feature_fasta_loader_handle_feature().
 *
 * @param $dfile
 *   Path of the FASTA file to read.
 * @param $organism_id
 *   The organism_id passed through to the feature handler.
 * @param $type
 *   Name (or synonym) of the 'sequence' vocabulary term for the features.
 * @param $library_id
 *   Not used by this function.
 * @param $re_name
 *   Regular expression (without delimiters) whose first capture group is
 *   the feature name. May be empty.
 * @param $re_uname
 *   Regular expression (without delimiters) whose first capture group is
 *   the feature unique name. May be empty.
 * @param $re_accession
 *   Regular expression (without delimiters) whose first capture group is
 *   the accession. May be empty.
 * @param $db_id
 *   The db_id passed through to the feature handler.
 * @param $rel_type
 *   Name of the 'relationship' vocabulary term, or empty for none.
 * @param $re_subject
 *   Regular expression (without delimiters) whose first capture group
 *   identifies the related feature. May be empty.
 * @param $parent_type
 *   Name of the 'sequence' vocabulary term of the related feature, or empty.
 * @param $method
 *   'Insert only', 'Update only' or 'Insert and update'.
 * @param $uid
 *   Not used by this function.
 * @param $analysis_id
 *   The analysis_id passed through to the feature handler.
 * @param $match_type
 *   'Name' or 'Unique name'. Decides which of the two is taken from the
 *   first word of the definition line when no regular expression is given.
 * @param $job
 *   Optional job identifier; when set, progress is reported through
 *   tripal_job_set_progress().
 *
 * @return
 *   0 when a required term cannot be found or the file cannot be opened;
 *   otherwise nothing.
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type,
  $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
  $re_subject, $parent_type, $method, $uid, $analysis_id,
  $match_type, $job = NULL) {

  // begin the transaction
  $connection = tripal_db_start_transaction();

  // if we cannot get a connection then let the user know the loading will be slow
  if (!$connection) {
     print "A persistant connection was not obtained. Loading will be slow\n";
  }
  else {
     print "\nNOTE: Loading of this FASTA file is performed using a database transaction. \n" .
           "If the load fails or is terminated prematurely then the entire set of \n" .
           "insertions/updates is rolled back and will not be found in the database\n\n";
  }

  // NOTE(review): the early returns below leave the function without
  // committing or rolling back the transaction opened above — confirm how
  // the transaction helper behaves when the job ends in that state.

  // first get the type for this sequence
  $cvtermsql = "SELECT CVT.cvterm_id
               FROM {cvterm} CVT
                  INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                  LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
               WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  $cvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $type, $type));
  if (!$cvterm) {
    watchdog("T_fasta_loader", "Cannot find the term type: '%type'", array('%type' => $type), WATCHDOG_ERROR);
    return 0;
  }

  // These stay NULL when no parent type / relationship was requested so the
  // handler always receives defined values.
  $parentcvterm = NULL;
  $relcvterm = NULL;
  if ($parent_type) {
    $parentcvterm = db_fetch_object(chado_query($cvtermsql, 'sequence', $parent_type, $parent_type));
    if (!$parentcvterm) {
      // log the name that was searched for, not the (empty) lookup result
      watchdog("T_fasta_loader", "Cannot find the parent term type: '%type'", array('%type' => $parent_type), WATCHDOG_ERROR);
      return 0;
    }
  }
  if ($rel_type) {
    $relcvterm = db_fetch_object(chado_query($cvtermsql, 'relationship', $rel_type, $rel_type));
    if (!$relcvterm) {
      watchdog("T_fasta_loader", "Cannot find the relationship term type: '%type'", array('%type' => $rel_type), WATCHDOG_ERROR);
      return 0;
    }
  }

  print "Opening FASTA file $dfile\n";

  $fh = fopen($dfile, 'r');
  if (!$fh) {
    watchdog('T_fasta_loader', "cannot open file: %dfile", array('%dfile' => $dfile), WATCHDOG_ERROR);
    return 0;
  }
  $filesize = filesize($dfile);
  $i = 0;           // current line number
  $num_read = 0;    // bytes read so far
  $intv_read = 0;   // bytes read since the last progress report

  $name = '';
  $uname = '';
  $residues = '';
  $accession = '';
  $subject = '';
  // Nothing in this loader assigns a source; it is only passed through to
  // the handler.
  $source = NULL;

  // report progress roughly every 1% of the file
  $interval = intval($filesize * 0.01);
  if ($interval < 1) {
    $interval = 1;
  }

  // we need to get the table schema to make sure we don't overrun the
  // size of fields with what our regular expressions retrieve
  $feature_tbl = tripal_core_get_chado_table_schema('feature');
  $dbxref_tbl = tripal_core_get_chado_table_schema('dbxref');

  // Compare against FALSE explicitly: a line consisting only of "0" is falsy
  // and would otherwise end the loop early.
  while (($line = fgets($fh)) !== FALSE) {
    $i++;  // update the line count
    // filesize() is in bytes, so progress is counted in bytes as well
    $line_len = strlen($line);
    $num_read += $line_len;
    $intv_read += $line_len;

    // if we encounter a definition line then get the name, uniquename,
    // accession and relationship subject from the definition line
    if (preg_match('/^>/', $line)) {
      // if we have a feature name then we are starting a new sequence
      // so lets handle the previous one before moving on
      if ($name or $uname) {
        tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
          $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
          $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
      }

      // Always start the new record from a clean slate. This also discards
      // the residues of a previous record whose definition line could not be
      // parsed, so they are not appended to this record, and it prevents an
      // accession or subject from an earlier record being reused.
      $residues = '';
      $name = '';
      $uname = '';
      $accession = '';
      $subject = '';

      $line = preg_replace("/^>/", '', $line); // remove the > symbol from the defline

      // get the feature name
      if ($re_name) {
        if (!preg_match("/$re_name/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        elseif (strlen(trim($matches[1])) > $feature_tbl['fields']['name']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves a value too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_WARNING);
        }
        else {
          $name = trim($matches[1]);
        }
      }
      // if the match_type is name and no regular expression was provided
      // then use the first word as the name, otherwise we don't set the name
      elseif (strcmp($match_type, 'Name') == 0) {
        if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
          if (strlen(trim($matches[1])) > $feature_tbl['fields']['name']['length']) {
            watchdog('trp-fasta', "WARNING: Regular expression retrieves a feature name too long for the feature name. Line %line.", array('%line' => $i), WATCHDOG_WARNING);
          }
          else {
            $name = trim($matches[1]);
          }
        }
        else {
          watchdog('trp-fasta', "ERROR: Cannot find a feature name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
      }

      // get the feature unique name
      if ($re_uname) {
        if (!preg_match("/$re_uname/", $line, $matches) or !isset($matches[1])) {
          watchdog('trp-fasta', "ERROR: Regular expression for the feature unique name finds nothing. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
        else {
          $uname = trim($matches[1]);
        }
      }
      // if the match_type is unique name and no regular expression was provided
      // then use the first word as the unique name, otherwise, we don't set the uniquename
      elseif (strcmp($match_type, 'Unique name') == 0) {
        if (preg_match("/^\s*(.*?)[\s\|].*$/", $line, $matches)) {
          $uname = trim($matches[1]);
        }
        else {
          watchdog('trp-fasta', "ERROR: Cannot find a feature unique name. Line %line.", array('%line' => $i), WATCHDOG_ERROR);
        }
      }

      // get the accession, but only when an expression was supplied: an
      // empty pattern matches everything and captures nothing
      if ($re_accession and preg_match("/$re_accession/", $line, $matches) and isset($matches[1])) {
        if (strlen(trim($matches[1])) > $dbxref_tbl['fields']['accession']['length']) {
          watchdog('trp-fasta', "WARNING: Regular expression retrieves an accession too long for the feature name. Cannot add cross reference. Line %line.", array('%line' => $i), WATCHDOG_WARNING);
        }
        else {
          $accession = trim($matches[1]);
        }
      }

      // get the relationship subject
      if ($re_subject and preg_match("/$re_subject/", $line, $matches) and isset($matches[1])) {
        $subject = trim($matches[1]);
      }
    }
    else {
      $residues .= trim($line);

      // update the job status every % features
      if ($job and $intv_read >= $interval) {
        $intv_read = 0;
        $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
        if ($name) {
          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $name\r";
        }
        else {
          print "Parsing Line $i (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes. Current feature: $uname\r";
        }
        tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
      }
    }
  }
  fclose($fh);

  // now load the last sequence in the file, provided one was parsed (an
  // empty file or an unparseable last definition line leaves both empty)
  if ($name or $uname) {
    tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id,
      $accession, $subject, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
      $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm);
  }

  // commit the transaction
  tripal_db_commit_transaction();
  print "\nDone\n";
}
00604 
00610 function tripal_feature_fasta_loader_handle_feature($name, $uname, $db_id, $accession,
00611   $parent, $rel_type, $parent_type, $analysis_id, $organism_id, $cvterm,
00612   $source, $residues, $method, $re_name, $match_type, $parentcvterm, $relcvterm) {
00613 
00614   // check to see if this feature already exists if the match_type is 'Name'
00615   if (strcmp($match_type, 'Name')==0) {
00616     $values = array(
00617       'organism_id' => $organism_id,
00618       'name' => $name,
00619       'type_id' => $cvterm->cvterm_id,    
00620     );
00621     $options = array('statement_name' => 'sel_feature_ornaty');
00622     $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
00623     if (count($results) > 1) {
00624       watchdog('T_fasta_loader', "Multiple features exist with the name '%name' of type 
00625                '%type' for the organism.  skipping", array('%name' => $name, '%type' => $type));
00626       return 0;
00627     } 
00628     if (count($results) == 1) {  
00629       $feature = $results[0];
00630     }     
00631   }
00632   // check to see if this feature already exists if the match_type is 'Unique Name'
00633   if (strcmp($match_type, 'Unique name')==0) {
00634     $values = array(
00635       'organism_id' => $organism_id,
00636       'uniquename' => $uname,
00637       'type_id' => $cvterm->cvterm_id,    
00638     );
00639 
00640     $options = array('statement_name' => 'sel_feature_oruqty');
00641     $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
00642     if (count($results) > 1) {
00643       watchdog('T_fasta_loader', "Multiple features exist with the name '%name' of type 
00644                '%type' for the organism.  skipping", array('%name' => $name, '%type' => $type));
00645       return 0;
00646     } 
00647     if (count($results) == 1) {  
00648       $feature = $results[0];
00649     }     
00650     
00651     // if the feature exists but this is an "insert only" method then skip this feature 
00652     if ($feature and (strcmp($method, 'Insert only')==0)) {
00653       watchdog('T_fasta_loader', "Feature already exists '%name' ('%uname') while matching on %type. Skipping insert.", 
00654         array('%name' => $name, '%uname' => $uname, '%type' => drupal_strtolower($match_type)), WATCHDOG_WARNING);
00655       return 0;
00656     } 
00657   }
00658 
00659   // if we don't have a feature and we're doing an insert then do the insert
00660   $inserted = 0;
00661   if (!$feature and (strcmp($method, 'Insert only')==0 or strcmp($method, 'Insert and update')==0)) {
00662     // if we have a unique name but not a name then set them to be the same and vice versa
00663     if (!$uname) {
00664       $uname = $name;
00665     }
00666     elseif (!$name) {
00667       $name = $uname;
00668     }
00669     
00670     // insert the feature
00671     $values = array(
00672       'organism_id' => $organism_id,
00673       'name' => $name,
00674       'uniquename' => $uname,
00675       'residues' => $residues,
00676       'seqlen' => drupal_strlen($residues),
00677       'md5checksum' => md5($residues),
00678       'type_id' => $cvterm->cvterm_id,
00679       'is_analysis' => 'FALSE',
00680       'is_obsolete' => 'FALSE',
00681     );
00682     $options = array('statement_name' => 'ins_feature_all');
00683     $success = tripal_core_chado_insert('feature', $values, $options);    
00684     if (!$success) {
00685       watchdog('T_fasta_loader', "Failed to insert feature '%name (%uname)'", 
00686         array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
00687       return 0;
00688     }
00689     
00690     // now get the feature we just inserted    
00691     $values = array(
00692       'organism_id' => $organism_id,
00693       'uniquename' => $uname,
00694       'type_id' => $cvterm->cvterm_id,    
00695     );
00696     $options = array('statement_name' => 'sel_feature_oruqty');
00697     $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
00698     if (count($results) == 1) {
00699        $inserted = 1;
00700        $feature = $results[0];
00701     } 
00702     else {
00703       watchdog('T_fasta_loader', "Failed to retreive newly inserted feature '%name (%uname)'", 
00704         array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
00705       return 0;  
00706     }     
00707   }
00708   
00709   // if we don't have a feature and the user wants to do an update then fail
00710   if (!$feature and (strcmp($method, 'Update only')==0 or drupal_strcmp($method, 'Insert and update')==0)) {
00711     watchdog('T_fasta_loader', "Failed to find feature '%name' ('%uname') while matching on " . 
00712       drupal_strtolower($match_type), array('%name' => $name, '%uname' => $uname), WATCHDOG_ERROR);
00713     return 0;
00714   }
00715   
00716   // if we do have a feature and this is an update then proceed with the update
00717   if ($feature and !$inserted and (strcmp($method, 'Update only')==0 or strcmp($method, 'Insert and update')==0)) {
00718     // if the user wants to match on the Name field
00719     if (strcmp($match_type, 'Name')==0) {
00720       // if we're matching on the name but do not have a unique name then we don't want to update the uniquename.  
00721       $values = array();
00722       if ($uname) {
00723         // first check to make sure that by changing the unique name of this feature that we won't conflict with
00724         // another existing feature of the same name
00725         $values = array(
00726           'organism_id' => $organism_id,
00727           'uniquename' => $uname,
00728           'type_id' => $cvterm->cvterm_id,    
00729         );    
00730         $options = array('statement_name' => 'sel_feature_oruqty');
00731         $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
00732         if (count($results) > 0) {
00733           watchdog('T_fasta_loader', "Cannot update the feature '%name' with a uniquename of '%uname' and type of '%type' as it 
00734             conflicts with an existing feature with the same uniquename and type.", 
00735             array('%name' => $name, '%uname' => $uname, '%type' => $type));
00736           return 0;
00737         } 
00738         
00739         // the changes to the uniquename don't conflict so proceed with the update
00740         $values = array(
00741           'uniquename' => $uname,
00742           'residues' => $residues,
00743           'seqlen' => drupal_strlen($residues),
00744           'md5checksum' => md5($residues),
00745           'is_analysis' => 'false',
00746           'is_obsolete' => 'false',
00747         );
00748         $match = array(
00749           'name' => $name,        
00750           'organism_id' => $organism_id,
00751           'type_id' => $cvterm->cvterm_id,        
00752         );
00753         $options = array('statement_name' => 'upd_feature_resemdisis_naorty_un');        
00754       }
00755       // if we do not have a new unique name then don't change the existing uniquename field
00756       else {
00757         $values = array(                 
00758           'residues' => $residues,
00759           'seqlen' => drupal_strlen($residues),
00760           'md5checksum' => md5($residues),
00761           'is_analysis' => 'false',
00762           'is_obsolete' => 'false',
00763         );
00764         $match = array(
00765           'name' => $name,       
00766           'organism_id' => $organism_id,
00767           'type_id' => $cvterm->cvterm_id,        
00768         );
00769         $options = array('statement_name' => 'upd_feature_unresemdisis_naorty'); 
00770       }
00771       
00772       // perform the update
00773       $success = tripal_core_chado_update('feature', $match, $values, $options);
00774       if (!$success) {
00775         watchdog('T_fasta_loader', "Failed to update feature '%name' ('%name')", 
00776           array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
00777         return 0;
00778       }
00779     }
00780     if (strcmp($match_type, 'Unique name')==0) {
00781       // if we're matching on the uniquename but do not have a new name then we don't want to update the name.  
00782       $values = array();
00783       if ($name) {
00784         $values = array(
00785           'name' => $name,
00786           'residues' => $residues,
00787           'seqlen' => drupal_strlen($residues),
00788           'md5checksum' => md5($residues),
00789           'is_analysis' => 'false',
00790           'is_obsolete' => 'false',
00791         );
00792         $match = array(
00793           'uniquename' => $uname,
00794           'organism_id' => $organism_id,          
00795           'type_id' => $cvterm->cvterm_id,        
00796         );
00797         $options = array('statement_name' => 'upd_feature_resemdisis_unorty_na');       
00798       }
00799       // if we do not have a new name then don't change the existing name field
00800       else {
00801         $values = array(                  
00802           'residues' => $residues,
00803           'seqlen' => drupal_strlen($residues),
00804           'md5checksum' => md5($residues),
00805           'is_analysis' => 'false',
00806           'is_obsolete' => 'false',
00807         );
00808         $match = array(
00809           'uniquename' => $uname,       
00810           'organism_id' => $organism_id,
00811           'type_id' => $cvterm->cvterm_id,        
00812         );
00813         $options = array('statement_name' => 'upd_feature_naresemdisis_unorty'); 
00814       }
00815       $success = tripal_core_chado_update('feature', $match, $values, $options);
00816       if (!$success) {
00817         watchdog('T_fasta_loader', "Failed to update feature '%name' ('%name')", 
00818           array('%name' => $name, '%uiname' => $uname), WATCHDOG_ERROR);
00819         return 0;
00820       }
00821     }    
00822   }
00823   
00824   // add in the analysis link
00825   if ($analysis_id) {
00826     // if the association doesn't already exist then add one
00827     $values = array(
00828       'analysis_id' => $analysis_id,
00829       'feature_id' => $feature->feature_id,
00830     );
00831     $sel_options = array('statement_name' => 'sel_analysisfeature_anfe');
00832     $results = tripal_core_chado_select('analysisfeature', array('analysisfeature_id'), $values, $sel_options);
00833     if (count($results) == 0) {
00834       $ins_options = array('statement_name' => 'ins_analysisfeature_anfe');
00835       $success = tripal_core_chado_insert('analysisfeature', $values, $ins_options);      
00836       if (!$success) {
00837         watchdog('T_fasta_loader', "Failed to associate analysis and feature '%name' ('%name')", 
00838           array('%name' => $name, '%uname' => $uname), WATCHDOG_ERROR);
00839         return 0;  
00840       }
00841     }
00842   }
00843 
00844   // now add the database cross reference
00845   if ($db_id) {
00846     // check to see if this accession reference exists, if not add it
00847     $values = array(
00848       'db_id' => $db_id,
00849       'accession' => $accession
00850     );
00851     $sel_options = array('statement_name' => 'sel_dbxref_dbac');
00852     $results = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $sel_options);
00853     // if the accession doesn't exist then add it
00854     if (count($results) == 0) {   
00855       $ins_options = array('statement_name' => 'ins_dbxref_dbac');
00856       $results = tripal_core_chado_insert('dbxref', $values, $ins_options);
00857       if (!$results) {
00858         watchdog('T_fasta_loader', "Failed to add database accession '%accession'", 
00859           array('%accession' => $accession), WATCHDOG_ERROR);
00860         return 0;
00861       }
00862       $results = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $sel_options);
00863       if (count($results) == 1) {
00864         $dbxref = $results[0];
00865       }
00866       else { 
00867         watchdog('T_fasta_loader', "Failed to retreive newly inserted dbxref '%name (%uname)'", 
00868           array('%name' => $name, '%uname' => $numane), WATCHDOG_ERROR);
00869         return 0;
00870       }
00871     } 
00872     else {
00873       $dbxref = $results[0];
00874     }
00875 
00876     // check to see if the feature dbxref record exists if not, then add it
00877     $values = array(
00878       'feature_id' => $feature->feature_id,
00879       'dbxref_id' => $dbxref->dbxref_id
00880     );
00881     $sel_options = array('statement_name' => 'sel_featuredbxref_fedb');
00882     $results = tripal_core_chado_select('feature_dbxref', array('feature_dbxref_id'), $values, $sel_options);
00883     if (count($results) == 0) {  
00884       $ins_options = array('statement_name' => 'ins_featuredbxref_fedb');
00885       $success = tripal_core_chado_insert('feature_dbxref', $values, $ins_options);
00886       if (!$success) {
00887         watchdog('T_fasta_loader', "Failed to add associate database accession '%accession' with feature", 
00888           array('%accession' => $accession), WATCHDOG_ERROR);
00889         return 0;
00890       }
00891     }              
00892   }
00893 
00894    // now add in the relationship if one exists. If not, then add it
00895   if ($rel_type) {
00896     $values = array(
00897       'organism_id' => $organism_id,
00898       'uniquename' => $parent,
00899       'type_id' => $parentcvterm->cvterm_id,    
00900     );
00901     $options = array('statement_name' => 'sel_feature_oruqty');
00902     $results = tripal_core_chado_select('feature', array('feature_id'), $values, $options);
00903     if (count($results) != 1) {
00904       watchdog('T_fasta_loader', "Cannot find a unique fature for the parent '%parent' of type 
00905                '%type' for the feature.", array('%parent' => $parent, '%type' => $parent_type));
00906       return 0;
00907     } 
00908     $parent_feature = $results[0];
00909     
00910    // check to see if the relationship already exists if not then add it
00911     $values = array(
00912       'subject_id' => $feature->feature_id,
00913       'object_id' => $parent_feature->feature_id,
00914       'type_id' => $relcvterm->cvterm_id,    
00915     );
00916     $sel_options = array('statement_name' => 'sel_featurerelationship_suojty');
00917     $results = tripal_core_chado_select('feature_relationship', array('feature_relationship_id'), $values, $sel_options);
00918     if (count($results) == 0) {    
00919       $ins_options = array('statement_name' => 'ins_featurerelationship_suojty');
00920       $success = tripal_core_chado_insert('feature_relationship', $values, $ins_options);
00921       if (!$success) {
00922         watchdog('T_fasta_loader', "Failed to add associate database accession '%accession' with feature", 
00923           array('%accession' => $accession), WATCHDOG_ERROR);
00924         return 0;
00925       }
00926     }        
00927   }
00928 }
00929 
 All Classes Files Functions Variables