Tripal v1.0 (6.x-1.0)
|
<?php

/**
 * @file
 * Form, validation, job submission and loader routines for importing GFF3
 * files into Chado.
 */

/**
 * Builds the GFF3 import form.
 *
 * The organism and analysis select lists are populated from the Chado
 * 'organism' and 'analysis' tables.
 *
 * @return
 *   A Drupal Form API array.
 */
function tripal_feature_gff3_load_form() {

  $form['gff_file'] = array(
    '#type' => 'textfield',
    '#title' => t('GFF3 File'),
    '#description' => t('Please enter the full system path for the GFF file, or a path within the Drupal
                         installation (e.g. /sites/default/files/xyz.gff).  The path must be accessible to the
                         server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // get the list of organisms
  $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  $org_rset = chado_query($sql);
  $organisms = array();
  $organisms[''] = '';
  while ($organism = db_fetch_object($org_rset)) {
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }
  // NOTE: '#type' is a machine name and must never be passed through t().
  $form['organism_id'] = array(
    '#title' => t('Organism'),
    '#type' => 'select',
    '#description' => t("Choose the organism to which these sequences are associated"),
    '#required' => TRUE,
    '#options' => $organisms,
  );

  // get the list of analyses
  $sql = "SELECT * FROM {analysis} ORDER BY name";
  $analysis_rset = chado_query($sql);
  $analyses = array();
  $analyses[''] = '';
  while ($analysis = db_fetch_object($analysis_rset)) {
    $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  }
  $form['analysis_id'] = array(
    '#title' => t('Analysis'),
    '#type' => 'select',
    '#description' => t("Choose the analysis to which these features are associated.
       Why specify an analysis for a data load?  All data comes
       from some place, even if downloaded from Genbank. By specifying
       analysis details for all data imports it allows an end user to reproduce the
       data set, but at least indicates the source of the data."),
    '#required' => TRUE,
    '#options' => $analyses,
  );

  $form['line_number'] = array(
    '#type' => 'textfield',
    '#title' => t('Start Line Number'),
    '#description' => t('Enter the line number in the GFF file where you would like to begin processing.  The
      first line is line number 1.  This option is useful for examining loading problems with large GFF files.'),
    '#size' => 10,
  );

  $form['landmark_type'] = array(
    '#title' => t('Landmark Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. Use this field to specify a Sequence Ontology type
       for the landmark sequences in the GFF fie (e.g. 'chromosome'). If the GFF file
       contains a '##sequence-region' line that describes the landmark sequences to
       which all others are aligned and a type is provided here then the features
       will be created if they do not already exist.  If they do exist then this
       field is not used."),
  );

  $form['alt_id_attr'] = array(
    '#title' => t('ID Attribute'),
    '#type' => 'textfield',
    '#description' => t("Optional. Sometimes lines in the GFF file are missing the
      required ID attribute that specifies the unique name of the feature. If so,
      you may specify an the name of an existing attribute to use for the name."),
  );

  $form['import_options'] = array(
    '#type' => 'fieldset',
    '#title' => t('Import Options'),
    '#collapsed' => TRUE
  );

  $form['import_options']['use_transaction'] = array(
    '#type' => 'checkbox',
    '#title' => t('Use a transaction'),
    '#required' => FALSE,
    '#description' => t('Use a database transaction when loading the GFF file.  If an error occurs
      the entire datset loaded prior to the failure will be rolled back and will not be available
      in the database.  If this option is unchecked and failure occurs all records up to the point
      of failure will be present in the database.'),
    '#default_value' => 1,
  );
  $form['import_options']['add_only'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import only new features'),
    '#required' => FALSE,
    '#description' => t('The job will skip features in the GFF file that already
                         exist in the database and import only new features.'),
  );
  // 'Import all and update' is the option selected by default.
  $form['import_options']['update'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import all and update'),
    '#required' => FALSE,
    '#default_value' => 1,
    '#description' => t('Existing features will be updated and new features will be added.  Attributes
                         for a feature that are not present in the GFF but which are present in the
                         database will not be altered.'),
  );
  $form['import_options']['refresh'] = array(
    '#type' => 'checkbox',
    '#title' => t('Import all and replace'),
    '#required' => FALSE,
    '#description' => t('Existing features will be updated and feature properties not
                         present in the GFF file will be removed.'),
  );
  $form['import_options']['remove'] = array(
    '#type' => 'checkbox',
    '#title' => t('Delete features'),
    '#required' => FALSE,
    '#description' => t('Features present in the GFF file that exist in the database
                         will be removed rather than imported'),
  );

  $form['targets'] = array(
    '#type' => 'fieldset',
    '#title' => t('Targets'),
    '#collapsed' => TRUE
  );
  $form['targets']['adesc'] = array(
    '#type' => 'markup',
    '#value' => t("When alignments are represented in the GFF file (e.g. such as
       alignments of cDNA sequences to a whole genome, or blast matches), they are
       represented using two feature types: 'match' (or cDNA_match, EST_match, etc.)
       and 'match_part'.  These features may also have a 'Target' attribute to
       specify the sequence that is being aligned.
       However, the organism to which the aligned sequence belongs may not be present in the
       GFF file.  Here you can specify the organism and feature type of the target sequences.
       The options here will apply to all targets unless the organism and type are explicity
       set in the GFF file using the 'target_organism' and 'target_type' attributes."),
  );
  $form['targets']['target_organism_id'] = array(
    '#title' => t('Target Organism'),
    '#type' => 'select',
    '#description' => t("Optional. Choose the organism to which target sequences belong.
      Select this only if target sequences belong to a different organism than the
      one specified above. And only choose an organism here if all of the target sequences
      belong to the same species.  If the targets in the GFF file belong to multiple
      different species then the organism must be specified using the 'target_organism=genus:species'
      attribute in the GFF file."),
    '#options' => $organisms,
  );
  $form['targets']['target_type'] = array(
    '#title' => t('Target Type'),
    '#type' => 'textfield',
    '#description' => t("Optional. If the unique name for a target sequence is not unique (e.g. a protein
      and an mRNA have the same name) then you must specify the type for all targets in the GFF file. If
      the targets are of different types then the type must be specified using the 'target_type=type' attribute
      in the GFF file. This must be a valid Sequence Ontology (SO) term."),
  );
  $form['targets']['create_target'] = array(
    '#type' => 'checkbox',
    '#title' => t('Create Target'),
    '#required' => FALSE,
    '#description' => t("If the target feature cannot be found, create one using the organism and type specified above, or
      using the 'target_organism' and 'target_type' fields specified in the GFF file.  Values specified in the
      GFF file take precedence over those specified above."),
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import GFF3 file'),
    '#weight' => 10,
  );

  return $form;
}

/**
 * Validation handler for the GFF3 import form.
 *
 * Checks that the GFF file can be found (either relative to the Drupal
 * installation or as given), that at most one import option is selected and
 * that the optional start line is a positive integer.
 *
 * @param $form
 *   The form array.
 * @param $form_state
 *   The form state; submitted values are read from $form_state['values'].
 */
function tripal_feature_gff3_load_form_validate($form, &$form_state) {

  $gff_file    = trim($form_state['values']['gff_file']);
  $add_only    = $form_state['values']['add_only'];
  $update      = $form_state['values']['update'];
  $refresh     = $form_state['values']['refresh'];
  $remove      = $form_state['values']['remove'];
  $line_number = trim($form_state['values']['line_number']);

  // check to see if the file is located local to Drupal
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
  if (!file_exists($dfile)) {
    // if not local to Drupal, the file must be someplace else, just use
    // the full path provided
    $dfile = $gff_file;
  }
  if (!file_exists($dfile)) {
    form_set_error('gff_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  }

  // the import options are mutually exclusive
  $num_selected = ($add_only ? 1 : 0) + ($update ? 1 : 0) + ($refresh ? 1 : 0) + ($remove ? 1 : 0);
  if ($num_selected > 1) {
    form_set_error('add_only', t("Please select only one checkbox from the import options section"));
  }

  // the start line is optional, but when given it must be a whole number
  // of at least 1 (the first line of the file is line number 1).
  if ($line_number !== '' && (!preg_match('/^\d+$/', $line_number) || $line_number < 1)) {
    form_set_error('line_number', t("Please provide an integer line number greater than zero."));
  }
}

/**
 * Submit handler for the GFF3 import form.
 *
 * Collects the submitted values and registers a 'tripal_feature_load_gff3'
 * job for the current user via tripal_add_job().
 *
 * @param $form
 *   The form array.
 * @param $form_state
 *   The form state; submitted values are read from $form_state['values'].
 *
 * @return
 *   Always an empty string.
 */
function tripal_feature_gff3_load_form_submit($form, &$form_state) {
  global $user;

  $gff_file = trim($form_state['values']['gff_file']);
  $organism_id = $form_state['values']['organism_id'];
  $add_only = $form_state['values']['add_only'];
  $update = $form_state['values']['update'];
  $refresh = $form_state['values']['refresh'];
  $remove = $form_state['values']['remove'];
  $analysis_id = $form_state['values']['analysis_id'];
  $use_transaction = $form_state['values']['use_transaction'];
  $target_organism_id = $form_state['values']['target_organism_id'];
  $target_type = trim($form_state['values']['target_type']);
  $create_target = $form_state['values']['create_target'];
  $line_number = trim($form_state['values']['line_number']);
  $landmark_type = trim($form_state['values']['landmark_type']);
  $alt_id_attr = trim($form_state['values']['alt_id_attr']);

  // the order of these arguments must match the parameter order of
  // tripal_feature_load_gff3().
  $args = array($gff_file, $organism_id, $analysis_id, $add_only,
    $update, $refresh, $remove, $use_transaction, $target_organism_id,
    $target_type, $create_target, $line_number, $landmark_type, $alt_id_attr);

  // build a human readable job name from the selected import option
  $type = '';
  if ($add_only) {
    $type = 'import only new features';
  }
  if ($update) {
    $type = 'import all and update';
  }
  if ($refresh) {
    $type = 'import all and replace';
  }
  if ($remove) {
    $type = 'delete features';
  }
  // strip any leading directories from the file name
  $fname = preg_replace("/.*\/(.*)/", "$1", $gff_file);
  tripal_add_job("$type GFF3 file: $fname", 'tripal_feature',
    'tripal_feature_load_gff3', $args, $user->uid);

  return '';
}

/**
 * Loads (or deletes) the features described by a GFF3 file.
 *
 * Each feature line is parsed and, depending on the selected mode, the
 * feature is deleted and/or added/updated together with its location,
 * aliases, dbxrefs, ontology terms, relationships and properties.  After all
 * lines are processed the ranks of child features in 'part_of' relationships
 * are reset in fmin order (reversed for parents on the reverse strand).
 *
 * @param $gff_file
 *   Path to the GFF3 file, either relative to the Drupal installation or a
 *   full system path.
 * @param $organism_id
 *   The organism_id of the organism the features belong to.
 * @param $analysis_id
 *   The analysis_id passed on to the feature loader.
 * @param $add_only
 *   Non-zero to add features; passed on to the feature loader.
 * @param $update
 *   Non-zero to add/update features.
 * @param $refresh
 *   Non-zero to delete each feature before adding it again.
 * @param $remove
 *   Non-zero to delete the features found in the file.
 * @param $use_transaction
 *   Non-zero to start a transaction before loading and commit it at the end.
 *   Error paths return without committing.
 * @param $target_organism_id
 *   Default organism for alignment targets (passed to the target loader).
 * @param $target_type
 *   Default type for alignment targets (passed to the target loader).
 * @param $create_target
 *   Passed to the target loader.
 * @param $start_line
 *   The first line (1-based) of the file to process.
 * @param $landmark_type
 *   Optional type name used to create features for '##sequence-region'
 *   lines.
 * @param $alt_id_attr
 *   Optional attribute name to use for the feature name when a line has
 *   neither an ID nor a Name attribute.
 * @param $job
 *   Optional job handle passed to tripal_job_set_progress().
 *
 * @return
 *   1 on success.  On failure an error is logged with watchdog() and a
 *   false-equivalent value (0, '' or NULL) is returned.
 */
function tripal_feature_load_gff3($gff_file, $organism_id, $analysis_id,
  $add_only = 0, $update = 0, $refresh = 0, $remove = 0, $use_transaction = 1,
  $target_organism_id = NULL, $target_type = NULL, $create_target = 0,
  $start_line = 1, $landmark_type = '', $alt_id_attr = '', $job = NULL) {

  // make sure our temporary table exists
  $ret = array();
  if (!db_table_exists('tripal_gff_temp')) {
    $schema = tripal_feature_get_custom_tables('tripal_gff_temp');
    $success = tripal_core_create_custom_table($ret, 'tripal_gff_temp', $schema['tripal_gff_temp']);
    if (!$success) {
      watchdog('T_gff3_loader', "Cannot create temporary loading table", array(), WATCHDOG_ERROR);
      return;
    }
  }
  // empty the temp table
  $sql = "DELETE FROM tripal_gff_temp";
  chado_query($sql);

  // get a persistent connection. If we cannot get one then let the user
  // know the loading will be slow
  $connection = tripal_db_persistent_chado();
  if (!$connection) {
    print "A persistant connection was not obtained. Loading will be slow\n";
  }

  // begin the transaction
  if ($use_transaction) {
    tripal_db_start_transaction();

    if ($connection) {
      print "\nNOTE: Loading of this GFF file is performed using a database transaction. \n" .
           "If the load fails or is terminated prematurely then the entire set of \n" .
           "insertions/updates is rolled back and will not be found in the database\n\n";
    }
  }

  // check to see if the file is located local to Drupal
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
  if (!file_exists($dfile)) {
    // if not local to Drupal, the file must be someplace else, just use
    // the full path provided
    $dfile = $gff_file;
  }
  if (!file_exists($dfile)) {
    watchdog('T_gff3_loader', "Cannot find the file: %dfile",
      array('%dfile' => $dfile), WATCHDOG_ERROR);
    return 0;
  }

  print "Opening $gff_file\n";

  $fh = fopen($dfile, 'r');
  if (!$fh) {
    watchdog('T_gff3_loader', "cannot open file: %dfile",
      array('%dfile' => $dfile), WATCHDOG_ERROR);
    return 0;
  }
  $filesize = filesize($dfile);

  // get the controlled vocaubulary that we'll be using.  The
  // default is the 'sequence' ontology
  // @coder-ignore: non-drupal schema thus table prefixing does not apply
  $sql = "SELECT * FROM cv WHERE name = '%s'";
  $cv = db_fetch_object(chado_query($sql, 'sequence'));
  if (!$cv) {
    watchdog('T_gff3_loader', "Cannot find the 'sequence' ontology",
      array(), WATCHDOG_ERROR);
    return '';
  }

  // get the organism for which this GFF3 file belongs
  // @coder-ignore: non-drupal schema thus table prefixing does not apply
  $sql = "SELECT * FROM organism WHERE organism_id = %d";
  $organism = db_fetch_object(chado_query($sql, $organism_id));

  // progress is reported each time another 0.01% of the file has been read
  $interval = intval($filesize * 0.0001);
  if ($interval == 0) {
    $interval = 1;
  }
  $in_fasta = 0;
  $line_num = 0;
  $num_read = 0;
  $intv_read = 0;

  // prepare the statement used to get the cvterm for each feature.
  if (!tripal_core_is_sql_prepared('sel_cvterm_idnasy')) {
    $psql = "PREPARE sel_cvterm_idnasy (int, text, text) AS
             SELECT CVT.cvterm_id, CVT.cv_id, CVT.name, CVT.definition,
               CVT.dbxref_id, CVT.is_obsolete, CVT.is_relationshiptype
             FROM {cvterm} CVT
               INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
               LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
             WHERE CV.cv_id = $1 and
              (lower(CVT.name) = lower($2) or lower(CVTS.synonym) = lower($3))";
    $status = tripal_core_chado_prepare('sel_cvterm_idnasy', $psql, array('int', 'text', 'text'));
    if (!$status) {
      watchdog('T_gff3_loader', 'cannot prepare statement \'sel_cvterm_idnasy\'.',
        array(), WATCHDOG_ERROR);
      return '';
    }
  }

  // attributes that are handled explicitly and therefore are never stored
  // as generic feature properties ('ID' and 'Name' are handled separately).
  $reserved_tags = array('Alias', 'Parent', 'Target', 'Gap', 'Derives_from',
    'Note', 'Dbxref', 'Ontology_term', 'Is_circular', 'target_organism',
    'target_type');

  // iterate through each line of the GFF file
  print "Parsing Line $line_num (0.00%). Memory: " . number_format(memory_get_usage()) . " bytes\r";
  // compare against FALSE so that a final line containing only "0" does not
  // end the loop early.
  while (($line = fgets($fh)) !== FALSE) {
    $line_num++;
    // count bytes (not characters) because progress is relative to the
    // size of the file in bytes
    $size = strlen($line);
    $num_read += $size;
    $intv_read += $size;

    if ($line_num < $start_line) {
      continue;
    }

    // update the job status each time another interval has been read
    if ($job and $intv_read >= $interval) {
      $intv_read = 0;
      $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
      print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
      tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
    }

    // check to see if we have FASTA section, if so then set the variable
    // to start parsing
    if (preg_match('/^##FASTA/i', $line)) {
      if ($remove) {
        // we're done because this is a delete operation so break out of the loop.
        break;
      }
      tripal_feature_load_gff3_fasta($fh, $interval, $num_read, $intv_read, $line_num);
      continue;
    }
    // if the ##sequence-region line is present then we want to add a new feature
    if (preg_match('/^##sequence-region (.*?) (\d+) (\d+)$/i', $line, $region_matches)) {
      $rid = $region_matches[1];
      $rstart = $region_matches[2];
      $rend = $region_matches[3];
      if ($landmark_type) {
        $result = chado_query("EXECUTE sel_cvterm_idnasy (%d, '%s', '%s')", $cv->cv_id, $landmark_type, $landmark_type);
        $cvterm = db_fetch_object($result);
        if (!$cvterm) {
          watchdog('T_gff3_loader', 'cannot find feature term \'%landmark_type\' on line %line_num of the GFF file',
            array('%landmark_type' => $landmark_type, '%line_num' => $line_num), WATCHDOG_ERROR);
          return '';
        }
        tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm, $rid,
          $rid, '', 'f', 'f', 1, 0);
      }
      continue;
    }

    // skip comments
    if (preg_match('/^#/', $line)) {
      continue;
    }

    // skip empty lines
    if (preg_match('/^\s*$/', $line)) {
      continue;
    }

    // get the columns
    $cols = explode("\t", $line);
    if (count($cols) != 9) {
      watchdog('T_gff3_loader', 'improper number of columns on line %line_num',
        array('%line_num' => $line_num), WATCHDOG_ERROR);
      return '';
    }

    // get the column values
    $landmark = $cols[0];
    $source = $cols[1];
    $type = $cols[2];
    $start = $cols[3];
    $end = $cols[4];
    $score = $cols[5];
    $strand = $cols[6];
    $phase = $cols[7];
    $attrs = explode(";", $cols[8]);  // split by a semicolon

    // ready the start and stop for chado.  Chado expects these positions
    // to be zero-based, so we substract 1 from the fmin
    $fmin = $start - 1;
    $fmax = $end;
    if ($end < $start) {
      $fmin = $end - 1;
      $fmax = $start;
    }

    // format the strand for chado
    if (strcmp($strand, '.') == 0) {
      $strand = 0;
    }
    elseif (strcmp($strand, '+') == 0) {
      $strand = 1;
    }
    elseif (strcmp($strand, '-') == 0) {
      $strand = -1;
    }
    if (strcmp($phase, '.') == 0) {
      $phase = '';
    }

    $result = chado_query("EXECUTE sel_cvterm_idnasy (%d, '%s', '%s')", $cv->cv_id, $type, $type);

    $cvterm = db_fetch_object($result);
    if (!$cvterm) {
      watchdog('T_gff3_loader', 'cannot find feature term \'%type\' on line %line_num of the GFF file',
        array('%type' => $type, '%line_num' => $line_num), WATCHDOG_ERROR);
      return '';
    }

    // break apart each of the attributes
    $tags = array();
    $attr_name = '';
    $attr_uniquename = '';
    $attr_residue_info = '';
    $attr_locgroup = 0;
    $attr_fmin_partial = 'f';
    $attr_fmax_partial = 'f';
    $attr_is_obsolete = 'f';
    $attr_is_analysis = 'f';
    // must be an array: it is filled as $attr_others[tag][] below
    $attr_others = array();
    $residues = '';

    foreach ($attrs as $attr) {
      $attr = trim($attr);
      if (strcmp($attr, '') == 0) {
        continue;
      }
      if (!preg_match('/^[^\=]+\=.+$/', $attr)) {
        watchdog('T_gff3_loader', 'Attribute is not correctly formatted on line %line_num: %attr',
          array('%line_num' => $line_num, '%attr' => $attr), WATCHDOG_ERROR);
        return '';
      }

      // break apart each tag at the first equals sign
      $tag = explode('=', $attr, 2);
      $tag_name = $tag[0];

      // multiple instances of an attribute are separated by commas. Replace
      // the URL escape codes of the new values only, so that values from an
      // earlier occurrence of the same tag are not decoded a second time.
      $new_values = array_map('urldecode', explode(",", $tag[1]));
      if (!array_key_exists($tag_name, $tags)) {
        $tags[$tag_name] = array();
      }
      $tags[$tag_name] = array_merge($tags[$tag_name], $new_values);

      // get the name and ID tags
      if (strcmp($tag_name, 'ID') == 0) {
        $attr_uniquename = urldecode($tag[1]);
      }
      elseif (strcmp($tag_name, 'Name') == 0) {
        $attr_name = urldecode($tag[1]);
      }
      // get the list of non-reserved attributes. Assigning the complete
      // list (rather than appending) keeps a repeated tag from adding its
      // earlier values twice.
      elseif (!in_array($tag_name, $reserved_tags, TRUE)) {
        $attr_others[$tag_name] = $tags[$tag_name];
      }
    }

    // if neither name nor uniquename are provided then generate one
    if (!$attr_uniquename and !$attr_name) {
      // check if an alternate ID field is suggested, if so, then use
      // that for the name
      if (array_key_exists($alt_id_attr, $tags)) {
        $attr_uniquename = $tags[$alt_id_attr][0];
        $attr_name = $attr_uniquename;
      }
      // if the row has a parent then generate a uniquename using the parent name
      elseif (array_key_exists('Parent', $tags)) {
        $attr_uniquename = $tags['Parent'][0] . "-$type-$landmark:$fmin..$fmax";
        $attr_name = $attr_uniquename;
      }
      // generate a unique name based on the date, type and location
      // and set the name to simply be the type
      else {
        $date = getdate();
        $attr_uniquename = $date[0] . "-$type-$landmark:$fmin..$fmax";
        $attr_name = $type;
      }
    }

    // if a name is not specified then use the unique name
    if (strcmp($attr_name, '') == 0) {
      $attr_name = $attr_uniquename;
    }

    // if an ID attribute is not specified then use the attribute name and
    // hope for the best
    if (!$attr_uniquename) {
      $attr_uniquename = $attr_name;
    }

    // make sure the landmark sequence exists in the database.  We don't
    // know the type of the landmark so we'll hope that it's unique across
    // all types. If not we'll error out.  This test is only necessary if
    // if the landmark and the uniquename are different.  If they are the same
    // then this is the information for the landmark
    if (!$remove and strcmp($landmark, $attr_uniquename) != 0) {
      $select = array(
        'organism_id' => $organism_id,
        'uniquename' => $landmark,
      );
      $columns = array('count(*) as num_landmarks');
      $options = array('statement_name' => 'sel_feature_numland');
      $count = tripal_core_chado_select('feature', $columns, $select, $options);
      if (!$count or count($count) == 0 or $count[0]->num_landmarks == 0) {
        watchdog('T_gff3_loader', "The landmark '%landmark' cannot be found for this organism (%genus %species) " .
          "Please add the landmark and then retry the import of this GFF3 " .
          "file",
          array('%landmark' => $landmark, '%genus' => $organism->genus, '%species' => $organism->species),
          WATCHDOG_ERROR);
        return '';
      }
      if ($count[0]->num_landmarks > 1) {
        watchdog('T_gff3_loader', "The landmark '%landmark' is not unique for this organism. " .
          "The features cannot be associated", array('%landmark' => $landmark), WATCHDOG_ERROR);
        return '';
      }
    }

    // if the option is to remove or refresh then we want to remove
    // the feature from the database.
    if ($remove or $refresh) {
      $match = array(
        'organism_id' => $organism->organism_id,
        'uniquename'  => $attr_uniquename,
        'type_id'     => $cvterm->cvterm_id
      );
      $result = tripal_core_chado_delete('feature', $match);
      if (!$result) {
        watchdog('T_gff3_loader', "cannot delete feature %attr_uniquename",
          array('%attr_uniquename' => $attr_uniquename), WATCHDOG_ERROR);
      }
      $feature = 0;
      unset($result);
    }

    // add or update the feature and all properties
    if ($update or $refresh or $add_only) {

      // add/update the feature
      $feature = tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm,
        $attr_uniquename, $attr_name, $residues, $attr_is_analysis,
        $attr_is_obsolete, $add_only, $score);

      if ($feature) {

        // add a record for this feature to the tripal_gff_temp table for
        // later lookup
        $values = array(
          'feature_id' => $feature->feature_id,
          'organism_id' => $feature->organism_id,
          'type_name' => $type,
          'uniquename' => $feature->uniquename
        );
        // make sure this record doesn't already exist in our temp table
        $options = array('statement_name' => 'sel_tripalgfftemp_all');
        $results = tripal_core_chado_select('tripal_gff_temp', array('*'), $values, $options);

        if (count($results) == 0) {
          $options = array('statement_name' => 'ins_tripalgfftemp');
          $result = tripal_core_chado_insert('tripal_gff_temp', $values, $options);
          if (!$result) {
            watchdog('T_gff3_loader', "Cound not save record in temporary table, Cannot continue.", array(), WATCHDOG_ERROR);
            // NOTE(review): this terminates the whole PHP process, unlike
            // the other error paths which return to the caller.
            exit;
          }
        }

        // add/update the featureloc if the landmark and the ID are not the same
        // if they are the same then this entry in the GFF is probably a landmark identifier
        if (strcmp($landmark, $attr_uniquename) != 0) {
          tripal_feature_load_gff3_featureloc($feature, $organism,
            $landmark, $fmin, $fmax, $strand, $phase, $attr_fmin_partial,
            $attr_fmax_partial, $attr_residue_info, $attr_locgroup);
        }
        // add any aliases for this feature
        if (array_key_exists('Alias', $tags)) {
          tripal_feature_load_gff3_alias($feature, $tags['Alias']);
        }
        // add any dbxrefs for this feature
        if (array_key_exists('Dbxref', $tags)) {
          tripal_feature_load_gff3_dbxref($feature, $tags['Dbxref']);
        }
        // add any ontology terms for this feature
        if (array_key_exists('Ontology_term', $tags)) {
          tripal_feature_load_gff3_ontology($feature, $tags['Ontology_term']);
        }
        // add parent relationships
        if (array_key_exists('Parent', $tags)) {
          tripal_feature_load_gff3_parents($feature, $cvterm, $tags['Parent'], $organism_id, $fmin);
        }
        // add target relationships
        if (array_key_exists('Target', $tags)) {
          tripal_feature_load_gff3_target($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup);
        }
        // add gap information.  This goes in simply as a property
        if (array_key_exists('Gap', $tags)) {
          foreach ($tags['Gap'] as $value) {
            tripal_feature_load_gff3_property($feature, 'Gap', $value);
          }
        }
        // add notes. This goes in simply as a property
        if (array_key_exists('Note', $tags)) {
          foreach ($tags['Note'] as $value) {
            tripal_feature_load_gff3_property($feature, 'Note', $value);
          }
        }
        // add the Derives_from relationship (e.g. polycistronic genes).
        if (array_key_exists('Derives_from', $tags)) {
          tripal_feature_load_gff3_derives_from($feature, $tags['Derives_from'][0], $organism);
        }
        // add in the GFF3_source dbxref so that GBrowse can find the feature using the source column
        $source_ref = array('GFF_source:' . $source);
        tripal_feature_load_gff3_dbxref($feature, $source_ref);
        // add any additional attributes
        if ($attr_others) {
          foreach ($attr_others as $tag_name => $values) {
            foreach ($values as $value) {
              tripal_feature_load_gff3_property($feature, $tag_name, $value);
            }
          }
        }
      }
    }
  }
  // the file is no longer needed once every line has been processed
  fclose($fh);

  if (!$remove) {
    print "\nSetting ranks of children...\n";

    // get features in a relationship that are also children of an alignment
    $sql = "SELECT DISTINCT F.feature_id, F.organism_id, F.type_id,
              F.uniquename, FL.strand
            FROM tripal_gff_temp TGT
              INNER JOIN feature F                ON TGT.feature_id = F.feature_id
              INNER JOIN feature_relationship FR  ON FR.object_id   = TGT.feature_id
              INNER JOIN cvterm CVT               ON CVT.cvterm_id  = FR.type_id
              INNER JOIN featureloc FL            ON FL.feature_id  = F.feature_id
            WHERE CVT.name = 'part_of'";
    $parents = chado_query($sql);

    // build and prepare the SQL for selecting the children relationship
    $sql = "SELECT DISTINCT FR.feature_relationship_id, FL.fmin, FR.rank
            FROM feature_relationship FR
              INNER JOIN featureloc FL on FL.feature_id = FR.subject_id";
    if (!$connection) {
      // without a persistent connection the query is run directly, so it
      // must not be handed to the prepare call. The leading space keeps the
      // WHERE clause separate from the preceding column name.
      $sql .= " WHERE FR.object_id = %d " .
              "ORDER BY FL.fmin ASC ";
    }
    else {
      $sql = "PREPARE sel_gffchildren (int) AS " . $sql . " WHERE FR.object_id = \$1 ORDER BY FL.fmin ASC";
      if (!tripal_core_is_sql_prepared('sel_gffchildren')) {
        $success = tripal_core_chado_prepare('sel_gffchildren', $sql, array('int'));
        if (!$success) {
          watchdog("T_gff3_loader", "Cannot prepare statement 'sel_gffchildren' and cannot set children ranks.",
            array(), WATCHDOG_WARNING);
          return 0;
        }
      }
    }

    // now set the rank of any parent/child relationships.  The order is based
    // on the fmin and the first child receives rank 0.
    while ($parent = db_fetch_object($parents)) {

      // get the children
      if ($connection) {
        $result = chado_query('EXECUTE sel_gffchildren (%d)', $parent->feature_id);
      }
      else {
        $result = chado_query($sql, $parent->feature_id);
      }

      // build an array of the children
      $children = array();
      while ($child = db_fetch_object($result)) {
        $children[] = $child;
      }

      // the children list comes sorted in ascending fmin
      // but if the parent is on the reverse strand we need to
      // reverse the order of the children.  (A value sort such as arsort()
      // would order the rows by their first column instead.)
      if ($parent->strand == -1) {
        $children = array_reverse($children);
      }

      // first set the ranks to a negative number so that we don't
      // get a duplicate error message when we try to change any of them
      $rank = -1;
      foreach ($children as $child) {
        $match = array('feature_relationship_id' => $child->feature_relationship_id);
        $options = array('statement_name' => 'upd_featurerelationship_rank');
        $values = array('rank' => $rank);
        tripal_core_chado_update('feature_relationship', $match, $values, $options);
        $rank--;
      }
      // now set the rank correctly. The rank should start at 0.
      $rank = 0;
      foreach ($children as $child) {
        $match = array('feature_relationship_id' => $child->feature_relationship_id);
        $options = array('statement_name' => 'upd_featurerelationship_rank');
        $values = array('rank' => $rank);
        tripal_core_chado_update('feature_relationship', $match, $values, $options);
        $rank++;
      }
    }
  }

  // commit the transaction
  if ($use_transaction) {
    tripal_db_commit_transaction();
  }
  print "Done\n";

  return 1;
}

/**
 * Adds a 'derives_from' relationship between a feature and another feature.
 *
 * The type of the other feature is looked up in the tripal_gff_temp table,
 * so that feature must have been loaded earlier from the same file.  Nothing
 * is inserted when the relationship already exists.
 *
 * @param $feature
 *   The feature object that becomes the subject_id of the relationship.
 * @param $subject
 *   The uniquename of the feature named by the Derives_from attribute; it
 *   becomes the object_id of the relationship.
 * @param $organism
 *   The organism object both features belong to.
 */
function tripal_feature_load_gff3_derives_from($feature, $subject, $organism) {

  // get the subject type
  $values = array(
    'organism_id' => $organism->organism_id,
    'uniquename' => $subject,
  );
  $options = array('statement_name' => 'sel_tripalgfftemp_orun');
  $result = tripal_core_chado_select('tripal_gff_temp', array('type_name'), $values, $options);
  if (count($result) == 0) {
    watchdog("T_gff3_loader", "Cannot find subject type for feature in 'derives_from' relationship: %subject", array('%subject' => $subject), WATCHDOG_WARNING);
    return '';
  }
  $subject_type = $result[0]->type_name;

  // get the subject feature
  $match = array(
    'organism_id' => $organism->organism_id,
    'uniquename' => $subject,
    'type_id' => array(
      'name' => $subject_type,
      'cv_id' => array(
        'name' => 'sequence'
      ),
    ),
  );
  $options = array('statement_name' => 'sel_feature_orunty');
  $sfeature = tripal_core_chado_select('feature', array('feature_id'), $match, $options);
  if (count($sfeature) == 0) {
    watchdog('T_gff3_loader', "Could not add 'Derives_from' relationship " .
      "for %uniquename and %subject.  Subject feature, '%subject', " .
      "cannot be found", array('%uniquename' => $feature->uniquename, '%subject' => $subject), WATCHDOG_ERROR);
    return;
  }

  // now check to see if the relationship already exists
  $values = array(
    'object_id' => $sfeature[0]->feature_id,
    'subject_id' => $feature->feature_id,
    'type_id' => array(
      'cv_id' => array(
        'name' => 'relationship'
      ),
      'name' => 'derives_from',
    ),
    'rank' => 0
  );
  $options = array('statement_name' => 'sel_featurerelationship_objectid_subjectid_typeid_rank');
  $rel = tripal_core_chado_select('feature_relationship', array('*'), $values, $options);
  if (count($rel) > 0) {
    return;
  }

  // finally insert the relationship if it doesn't exist
  $options = array('statement_name' => 'ins_featurerelationship_objectid_subjectid_typeid_rank');
  $ret = tripal_core_chado_insert('feature_relationship', $values, $options);
  if (!$ret) {
    // use placeholders rather than embedding the values in the message
    watchdog("T_gff3_loader", "Could not add 'Derives_from' relationship for %uniquename and %subject",
      array('%uniquename' => $feature->uniquename, '%subject' => $subject), WATCHDOG_WARNING);
  }
}

function tripal_feature_load_gff3_parents($feature, $cvterm, $parents, $organism_id, $fmin) {

  $uname = $feature->uniquename;
  $type = $cvterm->name;
  $rel_type = 'part_of';

  // prepare these SQL statements that will be used repeatedly.
00912 if (!tripal_core_is_sql_prepared('sel_cvterm_cvname_cvtname_synonym')) { 00913 $psql = "PREPARE sel_cvterm_cvname_cvtname_synonym (text, text, text) AS 00914 SELECT CVT.cvterm_id 00915 FROM cvterm CVT 00916 INNER JOIN cv CV on CVT.cv_id = CV.cv_id 00917 LEFT JOIN cvtermsynonym CVTS on CVTS.cvterm_id = CVT.cvterm_id 00918 WHERE cv.name = $1 and (CVT.name = $2 or CVTS.synonym = $3)"; 00919 $status = tripal_core_chado_prepare('sel_cvterm_cvname_cvtname_synonym', $psql, array('text', 'text' ,'text')); 00920 if (!$status) { 00921 watchdog("T_gff3_loader", "Cannot prepare statement 'sel_cvterm_cvname_cvtname_synonym' for ontology term", 00922 array(), WATCHDOG_WARNING); 00923 return ''; 00924 } 00925 } 00926 00927 // iterate through the parents in the list 00928 foreach ($parents as $parent) { 00929 // get the parent cvterm 00930 $values = array( 00931 'organism_id' => $organism_id, 00932 'uniquename' => $parent, 00933 ); 00934 $options = array('statement_name' => 'sel_tripalgfftemp_orun'); 00935 $result = tripal_core_chado_select('tripal_gff_temp', array('type_name'), $values, $options); 00936 if (count($result) == 0) { 00937 watchdog("T_gff3_loader", "Cannot find parent: %parent", array('%parent' => $parent), WATCHDOG_WARNING); 00938 return ''; 00939 } 00940 $parent_type = $result[0]->type_name; 00941 00942 // try to find the parent 00943 $parentcvterm = db_fetch_object(chado_query("EXECUTE sel_cvterm_cvname_cvtname_synonym ('%s', '%s', '%s')", 'sequence', $parent_type, $parent_type)); 00944 $relcvterm = db_fetch_object(chado_query("EXECUTE sel_cvterm_cvname_cvtname_synonym ('%s', '%s', '%s')", 'relationship', $rel_type, $rel_type)); 00945 $values = array( 00946 'organism_id' => $organism_id, 00947 'uniquename' => $parent, 00948 'type_id' => $parentcvterm->cvterm_id, 00949 ); 00950 $options = array('statement_name' => 'sel_feature_orunty'); 00951 $result = tripal_core_chado_select('feature', array('feature_id'), $values, $options); 00952 $parent_feature = 
$result[0]; 00953 00954 // if the parent exists then add the relationship otherwise print error and skip 00955 if ($parent_feature) { 00956 00957 // check to see if the relationship already exists 00958 $values = array( 00959 'object_id' => $parent_feature->feature_id, 00960 'subject_id' => $feature->feature_id, 00961 'type_id' => $relcvterm->cvterm_id, 00962 ); 00963 $options = array('statement_name' => 'sel_featurerelationship_objectid_subjectid_typeid'); 00964 $rel = tripal_core_chado_select('feature_relationship', array('*'), $values, $options); 00965 00966 if (count($rel) > 0) { 00967 } 00968 else { 00969 // the relationship doesn't already exist, so add it. 00970 $values = array( 00971 'subject_id' => $feature->feature_id, 00972 'object_id' => $parent_feature->feature_id, 00973 'type_id' => $relcvterm->cvterm_id, 00974 ); 00975 $options = array('statement_name' => 'ins_featurerelationship_subjectid_objectid_typeid'); 00976 $result = tripal_core_chado_insert('feature_relationship', $values, $options); 00977 if (!$result) { 00978 watchdog("T_gff3_loader", "Failed to insert feature relationship '$uname' ($type) $rel_type '$parent' ($parent_type)", 00979 array(), WATCHDOG_WARNING); 00980 } 00981 } 00982 } 00983 else { 00984 watchdog("T_gff3_loader", "Cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent", 00985 array(), WATCHDOG_WARNING); 00986 } 00987 } 00988 } 00989 00996 function tripal_feature_load_gff3_dbxref($feature, $dbxrefs) { 00997 00998 // iterate through each of the dbxrefs 00999 foreach ($dbxrefs as $dbxref) { 01000 01001 // get the database name from the reference. If it doesn't exist then create one. 01002 $ref = explode(":", $dbxref); 01003 $dbname = $ref[0]; 01004 $accession = $ref[1]; 01005 01006 // first look for the database name if it doesn't exist then create one. 01007 // first check for the fully qualified URI (e.g. DB:<dbname>. 
If that 01008 // can't be found then look for the name as is. If it still can't be found 01009 // the create the database 01010 $values = array('name' => "DB:$dbname"); 01011 $options = array('statement_name' => 'sel_db_name'); 01012 $db = tripal_core_chado_select('db', array('db_id'), $values, $options); 01013 if (count($db) == 0) { 01014 $values = array('name' => "$dbname"); 01015 $db = tripal_core_chado_select('db', array('db_id'), $values, $options); 01016 } 01017 if (count($db) == 0) { 01018 $values = array( 01019 'name' => $dbname, 01020 'description' => 'Added automatically by the GFF loader' 01021 ); 01022 $options = array('statement_name' => 'ins_db_name'); 01023 $success = tripal_core_chado_insert('db', $values, $options); 01024 if ($success) { 01025 $values = array('name' => "$dbname"); 01026 $options = array('statement_name' => 'sel_db_name'); 01027 $db = tripal_core_chado_select('db', array('db_id'), $values, $options); 01028 } 01029 else { 01030 watchdog("T_gff3_loader", "Cannot find or add the database $dbname", array(), WATCHDOG_WARNING); 01031 return 0; 01032 } 01033 } 01034 $db = $db[0]; 01035 01036 // now check to see if the accession exists 01037 $values = array( 01038 'accession' => $accession, 01039 'db_id' => $db->db_id 01040 ); 01041 $options = array('statement_name' => 'sel_dbxref_accession_dbid'); 01042 $dbxref = tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $options); 01043 01044 // if the accession doesn't exist then we want to add it 01045 if (sizeof($dbxref) == 0) { 01046 $values = array( 01047 'db_id' => $db->db_id, 01048 'accession' => $accession, 01049 'version' => '' 01050 ); 01051 $options = array('statement_name' => 'ins_dbxref_dbid_accession_version'); 01052 $ret = tripal_core_chado_insert('dbxref', $values, $options); 01053 $values = array( 01054 'accession' => $accession, 01055 'db_id' => $db->db_id 01056 ); 01057 $options = array('statement_name' => 'sel_dbxref_accession_dbid'); 01058 $dbxref = 
tripal_core_chado_select('dbxref', array('dbxref_id'), $values, $options); 01059 } 01060 $dbxref = $dbxref[0]; 01061 01062 // check to see if this feature dbxref already exists 01063 $values = array( 01064 'dbxref_id' => $dbxref->dbxref_id, 01065 'feature_id' => $feature->feature_id 01066 ); 01067 $options = array('statement_name' => 'sel_featuredbxref_dbxrefid_featureid'); 01068 $fdbx = tripal_core_chado_select('feature_dbxref', array('feature_dbxref_id'), $values, $options); 01069 01070 // now associate this feature with the database reference if it doesn't 01071 // already exist 01072 if (sizeof($fdbx) == 0) { 01073 $values = array( 01074 'dbxref_id' => $dbxref->dbxref_id, 01075 'feature_id' => $feature->feature_id 01076 ); 01077 $options = array('statement_name' => 'ins_featuredbxref_dbxrefid_featureid'); 01078 $success = tripal_core_chado_insert('feature_dbxref', $values, $options); 01079 if (!$success) { 01080 watchdog("T_gff3_loader", "Failed to insert Dbxref: $dbname:$accession", array(), WATCHDOG_WARNING); 01081 return 0; 01082 } 01083 } 01084 } 01085 return 1; 01086 } 01092 function tripal_feature_load_gff3_ontology($feature, $dbxrefs) { 01093 01094 // iterate through each of the dbxrefs 01095 foreach ($dbxrefs as $dbxref) { 01096 01097 // get the database name from the reference. If it doesn't exist then create one. 01098 $ref = explode(":", $dbxref); 01099 $dbname = $ref[0]; 01100 $accession = $ref[1]; 01101 01102 // first look for the database name 01103 $options = array('statement_name' => 'sel_db_name'); 01104 $db = tripal_core_chado_select('db', array('db_id'), array('name' => "DB:$dbname"), $options); 01105 if (sizeof($db) == 0) { 01106 // now look for the name without the 'DB:' prefix. 01107 $db = tripal_core_chado_select('db', array('db_id'), array('name' => "$dbname"), $options); 01108 if (sizeof($db) == 0) { 01109 watchdog("T_gff3_loader", "Database, $dbname, is not present. 
Cannot associate term: $dbname:$accession", array(), WATCHDOG_WARNING); 01110 return 0; 01111 } 01112 } 01113 $db = $db[0]; 01114 01115 // now check to see if the accession exists 01116 $options = array('statement_name' => 'sel_dbxref_accession_dbid'); 01117 $dbxref = tripal_core_chado_select('dbxref', array('dbxref_id'), 01118 array('accession' => $accession, 'db_id' => $db->db_id), $options); 01119 if (sizeof($dbxref) == 0) { 01120 watchdog("T_gff3_loader", "Accession, $accession is missing for reference: $dbname:$accession", array(), WATCHDOG_WARNING); 01121 return 0; 01122 } 01123 $dbxref = $dbxref[0]; 01124 01125 // now check to see if the cvterm exists 01126 $options = array('statement_name' => 'sel_cvterm_dbxrefid'); 01127 $cvterm = tripal_core_chado_select('cvterm', array('cvterm_id'), array( 01128 'dbxref_id' => $dbxref->dbxref_id), $options); 01129 // if it doesn't exist in the cvterm table, look for an alternate id 01130 if (sizeof($cvterm) == 0) { 01131 $options = array('statement_name' => 'sel_cvtermdbxref_dbxrefid'); 01132 $cvterm = tripal_core_chado_select('cvterm_dbxref', array('cvterm_id'), array( 01133 'dbxref_id' => $dbxref->dbxref_id), $options); 01134 if (sizeof($cvterm) == 0) { 01135 watchdog("T_gff3_loader", "CV Term is missing for reference: $dbname:$accession", array(), WATCHDOG_WARNING); 01136 return 0; 01137 } 01138 } 01139 $cvterm = $cvterm[0]; 01140 01141 01142 // check to see if this feature cvterm already exists 01143 $options = array('statement_name' => 'sel_featurecvterm_cvtermid_featureid'); 01144 $fcvt = tripal_core_chado_select('feature_cvterm', array('feature_cvterm_id'), 01145 array('cvterm_id' => $cvterm->cvterm_id, 'feature_id' => $feature->feature_id), 01146 $options); 01147 01148 // now associate this feature with the cvterm if it doesn't already exist 01149 if (sizeof($fcvt)==0) { 01150 $values = array( 01151 'cvterm_id' => $cvterm->cvterm_id, 01152 'feature_id' => $feature->feature_id, 01153 'pub_id' => array( 01154 
'uniquename' => 'null', 01155 ), 01156 ); 01157 $options = array('statement_name' => 'ins_featurecvterm_cvtermid_featureid_pubid'); 01158 $success = tripal_core_chado_insert('feature_cvterm', $values, $options); 01159 01160 if (!$success) { 01161 watchdog("T_gff3_loader", "Failed to insert ontology term: $dbname:$accession", array(), WATCHDOG_WARNING); 01162 return 0; 01163 } 01164 } 01165 } 01166 return 1; 01167 } 01173 function tripal_feature_load_gff3_alias($feature, $aliases) { 01174 01175 // make sure we have a 'synonym_type' vocabulary 01176 $select = array('name' => 'synonym_type'); 01177 $options = array('statement_name' => 'sel_cv_name'); 01178 $results = tripal_core_chado_select('cv', array('*'), $select, $options); 01179 01180 if (count($results) == 0) { 01181 // insert the 'synonym_type' vocabulary 01182 $values = array( 01183 'name' => 'synonym_type', 01184 'definition' => 'vocabulary for synonym types', 01185 ); 01186 $options = array('statement_name' => 'ins_cv_name_definition'); 01187 $success = tripal_core_chado_insert('cv', $values, $options); 01188 if (!$success) { 01189 watchdog("T_gff3_loader", "Failed to add the synonyms type vocabulary", array(), WATCHDOG_WARNING); 01190 return 0; 01191 } 01192 // now that we've added the cv we need to get the record 01193 $options = array('statement_name' => 'sel_cv_name'); 01194 $results = tripal_core_chado_select('cv', array('*'), $select, $options); 01195 if (count($results) > 0) { 01196 $syncv = $results[0]; 01197 } 01198 } 01199 else { 01200 $syncv = $results[0]; 01201 } 01202 01203 // get the 'exact' cvterm, which is the type of synonym we're adding 01204 $select = array( 01205 'name' => 'exact', 01206 'cv_id' => array( 01207 'name' => 'synonym_type' 01208 ), 01209 ); 01210 $options = array('statement_name' => 'sel_cvterm_name_cvid'); 01211 $result = tripal_core_chado_select('cvterm', array('*'), $select, $options); 01212 if (count($result) == 0) { 01213 $term = array( 01214 'name' => 'exact', 01215 
'id' => "internal:exact", 01216 'definition' => '', 01217 'is_obsolete' => 0, 01218 ); 01219 // TODO: fix the function so it uses prepared statements 01220 $syntype = tripal_cv_add_cvterm($term, $syncv->name, 0, 1); 01221 if (!$syntype) { 01222 watchdog("T_gff3_loader", "Cannot add synonym type: internal:$type", array(), WATCHDOG_WARNING); 01223 return 0; 01224 } 01225 } 01226 else { 01227 $syntype = $result[0]; 01228 } 01229 01230 // iterate through all of the aliases and add each one 01231 foreach ($aliases as $alias) { 01232 01233 // check to see if the alias already exists in the synonym table 01234 // if not, then add it 01235 $select = array( 01236 'name' => $alias, 01237 'type_id' => $syntype->cvterm_id, 01238 ); 01239 $options = array('statement_name' => 'sel_synonym_name_typeid'); 01240 $result = tripal_core_chado_select('synonym', array('*'), $select, $options); 01241 if (count($result) == 0) { 01242 $values = array( 01243 'name' => $alias, 01244 'type_id' => $syntype->cvterm_id, 01245 'synonym_sgml' => '', 01246 ); 01247 $options = array('statement_name' => 'ins_synonym_name_typeid_synonymsgml'); 01248 $success = tripal_core_chado_insert('synonym', $values, $options); 01249 if (!$success) { 01250 watchdog("T_gff3_loader", "Cannot add alias $alias to synonym table", array(), WATCHDOG_WARNING); 01251 return 0; 01252 } 01253 $options = array('statement_name' => 'sel_synonym_name_typeid'); 01254 $result = tripal_core_chado_select('synonym', array('*'), $select, $options); 01255 $synonym = $result[0]; 01256 } 01257 else { 01258 $synonym = $result[0]; 01259 } 01260 01261 // check to see if we have a NULL publication in the pub table. If not, 01262 // then add one. 
01263 // @coder-ignore: non-drupal schema thus table prefixing does not apply 01264 $select = array('uniquename' => 'null'); 01265 $options = array('statement_name' => 'sel_pub_uniquename'); 01266 $result = tripal_core_chado_select('pub', array('*'), $select, $options); 01267 if (count($result) == 0) { 01268 // prepare the statement 01269 if (!tripal_core_is_sql_prepared('ins_pub_uniquename_typeid')) { 01270 $psql = "PREPARE ins_pub_uniquename_typeid (text, text) AS 01271 INSERT INTO pub (uniquename,type_id) VALUES ('%s', 01272 (SELECT cvterm_id 01273 FROM cvterm CVT 01274 INNER JOIN dbxref DBX on DBX.dbxref_id = CVT.dbxref_id 01275 INNER JOIN db DB on DB.db_id = DBX.db_id 01276 WHERE CVT.name = $1 and DB.name = $2)"; 01277 $status = tripal_core_chado_prepare('ins_pub_uniquename_typeid', $psql, args('text', 'text')); 01278 if (!$status) { 01279 watchdog("T_gff3_loader", "Cannot prepare statement 'ins_pub_uniquename_typeid", array(), WATCHDOG_WARNING); 01280 return 0; 01281 } 01282 } 01283 // insert the null pub 01284 $result = db_fetch_object(chado_query("EXECUTE ins_pub_uniquename_typeid ('%s', '%s')", 'null', 'null')); 01285 if (!$result) { 01286 watchdog("T_gff3_loader", "Cannot add null publication needed for setup of alias", array(), WATCHDOG_WARNING); 01287 return 0; 01288 } 01289 $options = array('statement_name' => 'sel_pub_uniquename'); 01290 $result = tripal_core_chado_select('pub', array('*'), $select, $options); 01291 $pub = $result[0]; 01292 } 01293 else { 01294 $pub = $result[0]; 01295 } 01296 01297 // check to see if the synonym exists in the feature_synonym table 01298 // if not, then add it. 
01299 $values = array( 01300 'synonym_id' => $synonym->synonym_id, 01301 'feature_id' => $feature->feature_id, 01302 'pub_id' => $pub->pub_id, 01303 ); 01304 $columns = array('feature_synonym_id'); 01305 $options = array('statement_name' => 'sel_featuresynonym_syfepu'); 01306 $result = tripal_core_chado_select('feature_synonym', $columns, $values, $options); 01307 if (count($result) == 0) { 01308 $values = array( 01309 'synonym_id' => $synonym->synonym_id, 01310 'feature_id' => $feature->feature_id, 01311 'pub_id' => $pub->pub_id, 01312 ); 01313 $ins_options = array('statement_name' => 'ins_featuresynonym_syfepu'); 01314 $success = tripal_core_chado_insert('feature_synonym', $values, $ins_options); 01315 01316 if (!$success) { 01317 watchdog("T_gff3_loader", "Cannot add alias $alias to feature synonym table", array(), WATCHDOG_WARNING); 01318 return 0; 01319 } 01320 } 01321 } 01322 return 1; 01323 } 01324 01330 function tripal_feature_load_gff3_feature($organism, $analysis_id, $cvterm, $uniquename, 01331 $name, $residues, $is_analysis = 'f', $is_obsolete = 'f', $add_only, $score) { 01332 01333 // check to see if the feature already exists 01334 $feature = NULL; 01335 $fselect = array( 01336 'organism_id' => $organism->organism_id, 01337 'uniquename' => $uniquename, 01338 'type_id' => $cvterm->cvterm_id 01339 ); 01340 $options = array('statement_name' => 'sel_feature_orunty'); 01341 $columns = array('feature_id', 'name', 'uniquename', 'seqlen', 'organism_id', 'type_id'); 01342 $result = tripal_core_chado_select('feature', $columns, $fselect, $options); 01343 if (count($result) > 0) { 01344 $feature = $result[0]; 01345 } 01346 01347 if (strcmp($is_obsolete, 'f')==0 or $is_obsolete == 0) { 01348 $is_obsolete = 'FALSE'; 01349 } 01350 if (strcmp($is_obsolete, 't')==0 or $is_obsolete == 1) { 01351 $is_obsolete = 'TRUE'; 01352 } 01353 if (strcmp($is_analysis, 'f')==0 or $is_analysis == 0) { 01354 $is_analysis = 'FALSE'; 01355 } 01356 if (strcmp($is_analysis, 't')==0 or 
$is_analysis == 1) { 01357 $is_analysis = 'TRUE'; 01358 } 01359 01360 // insert the feature if it does not exist otherwise perform an update 01361 if (!$feature) { 01362 $values = array( 01363 'organism_id' => $organism->organism_id, 01364 'name' => $name, 01365 'uniquename' => $uniquename, 01366 // 'residues' => $residues, 01367 // 'seqlen' => drupal_strlen($residues), 01368 'md5checksum' => md5($residues), 01369 'type_id' => $cvterm->cvterm_id, 01370 'is_analysis' => $is_analysis, 01371 'is_obsolete' => $is_obsolete, 01372 ); 01373 $options = array('statement_name' => 'ins_feature_all'); 01374 $result = tripal_core_chado_insert('feature', $values, $options); 01375 if (!$result) { 01376 watchdog("T_gff3_loader", "Failed to insert feature '$uniquename' ($cvterm->name)", array(), WATCHDOG_WARNING); 01377 return 0; 01378 } 01379 } 01380 elseif (!$add_only) { 01381 $values = array( 01382 'name' => $name, 01383 // 'residues' => $residues, 01384 // 'seqlen' => drupal_strlen($residues), 01385 'md5checksum' => md5($residues), 01386 'is_analysis' => $is_analysis, 01387 'is_obsolete' => $is_obsolete, 01388 ); 01389 $match = array( 01390 'organism_id' => $organism->organism_id, 01391 'uniquename' => $uniquename, 01392 'type_id' => $cvterm->cvterm_id, 01393 ); 01394 $options = array('statement_name' => 'upd_feature'); 01395 $result = tripal_core_chado_update('feature', $match, $values, $options); 01396 if (!$result) { 01397 watchdog("T_gff3_loader", "Failed to update feature '$uniquename' ($cvterm->name)", array(), WATCHDOG_WARNING); 01398 return 0; 01399 } 01400 } 01401 else { 01402 // the feature exists and we don't want to update it so return 01403 // a value of 0. 
This will stop all downstream property additions 01404 return 0; 01405 } 01406 01407 // get the newly added feature 01408 $options = array('statement_name' => 'sel_feature_orunty'); 01409 $columns = array('feature_id', 'name', 'uniquename', 'seqlen', 'organism_id', 'type_id'); 01410 $result = tripal_core_chado_select('feature', $columns, $fselect, $options); 01411 $feature = $result[0]; 01412 01413 // add the analysisfeature entry to the analysisfeature table if it doesn't already exist 01414 $af_values = array( 01415 'analysis_id' => $analysis_id, 01416 'feature_id' => $feature->feature_id 01417 ); 01418 $options = array('statement_name' => 'sel_analysisfeature_analysisid_featureid'); 01419 $afeature = tripal_core_chado_select('analysisfeature', array('analysisfeature_id'), $af_values, $options); 01420 if (count($afeature)==0) { 01421 // if a score is available then set that to be the significance field 01422 if (strcmp($score, '.') != 0) { 01423 $af_values['significance'] = $score; 01424 $options = array('statement_name' => 'ins_analysisfeature_analysisid_featureid_significance'); 01425 } 01426 else { 01427 $options = array('statement_name' => 'ins_analysisfeature_analysisid_featureid'); 01428 } 01429 if (!tripal_core_chado_insert('analysisfeature', $af_values, $options)) { 01430 watchdog("T_gff3_loader", "Could not add analysisfeature record: $analysis_id, $feature->feature_id", array(), WATCHDOG_WARNING); 01431 } 01432 } 01433 else { 01434 // if a score is available then set that to be the significance field 01435 $new_vals = array(); 01436 if (strcmp($score, '.')!=0) { 01437 $new_vals['significance'] = $score; 01438 } 01439 else { 01440 $new_vals['significance'] = '__NULL__'; 01441 } 01442 if (!$add_only) { 01443 $options = array('statement_name' => 'upd_analysisfeature'); 01444 $ret = tripal_core_chado_update('analysisfeature', $af_values, $new_vals, $options); 01445 if (!$ret) { 01446 watchdog("T_gff3_loader", "Could not update analysisfeature record: 
$analysis_id, $feature->feature_id", array(), WATCHDOG_WARNING); 01447 } 01448 } 01449 } 01450 return $feature; 01451 } 01452 01458 function tripal_feature_load_gff3_featureloc($feature, $organism, $landmark, $fmin, 01459 $fmax, $strand, $phase, $is_fmin_partial, $is_fmax_partial, $residue_info, $locgroup, 01460 $landmark_type_id = '', $landmark_organism_id = '', $create_landmark = 0, 01461 $landmark_is_target = 0) { 01462 01463 $select = array( 01464 'organism_id' => $landmark_organism_id ? $landmark_organism_id : $organism->organism_id, 01465 'uniquename' => $landmark, 01466 ); 01467 $options = array('statement_name' => 'sel_feature_orun'); 01468 if ($landmark_type_id) { 01469 $select['type_id'] = $landmark_type_id; 01470 $options = array('statement_name' => 'sel_feature_orunty'); 01471 } 01472 $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options); 01473 01474 $srcfeature = ''; 01475 if (count($results)==0) { 01476 // so we couldn't find the landmark using the uniquename. Let's try the 'name'. 01477 // if we return only a single result then we can proceed. Otherwise give an 01478 $select = array( 01479 'organism_id' => $landmark_organism_id ? $landmark_organism_id : $organism->organism_id, 01480 'name' => $landmark, 01481 ); 01482 $options = array('statement_name' => 'sel_feature_orna'); 01483 if ($landmark_type_id) { 01484 $select['type_id'] = $landmark_type_id; 01485 $options = array('statement_name' => 'sel_feature_ornaty'); 01486 } 01487 $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options); 01488 if (count($results) == 0) { 01489 // if the landmark is the target feature in a matched alignment then try one more time to 01490 // find it by querying any feature with the same uniquename. If we find one then use it. 
01491 if ($landmark_is_target) { 01492 $select = array('uniquename' => $landmark); 01493 $options = array('statement_name' => 'sel_feature_un'); 01494 $results = tripal_core_chado_select('feature', array('feature_id'), $select, $options); 01495 if (count($results) == 1) { 01496 $srcfeature = $results[0]; 01497 } 01498 } 01499 01500 if (!$srcfeature) { 01501 // we couldn't find the landmark feature, so if the user has requested we create it then do so 01502 // but only if we have a type id 01503 if ($create_landmark and $landmark_type_id) { 01504 $values = array( 01505 'organism_id' => $landmark_organism_id ? $landmark_organism_id : $organism->organism_id, 01506 'name' => $landmark, 01507 'uniquename' => $landmark, 01508 'type_id' => $landmark_type_id 01509 ); 01510 $options = array('statement_name' => 'ins_feature_ornaunty'); 01511 $results = tripal_core_chado_insert('feature', $values, $options); 01512 if (!$results) { 01513 watchdog("T_gff3_loader", "Cannot find landmark feature: '%landmark', nor could it be inserted", 01514 array('%landmark' => $landmark), WATCHDOG_WARNING); 01515 return 0; 01516 } 01517 $srcfeature = new stdClass(); 01518 $srcfeature->feature_id = $results['feature_id']; 01519 } 01520 else { 01521 watchdog("T_gff3_loader", "Cannot find unique landmark feature: '%landmark'.", 01522 array('%landmark' => $landmark), WATCHDOG_WARNING); 01523 return 0; 01524 } 01525 } 01526 } 01527 elseif (count($results) > 1) { 01528 watchdog("T_gff3_loader", "multiple landmarks exist with the name: '%landmark'. Cannot 01529 resolve which one to use. Cannot add the feature location record", 01530 array('%landmark' => $landmark), WATCHDOG_WARNING); 01531 return 0; 01532 } 01533 else { 01534 $srcfeature = $results[0]; 01535 } 01536 } 01537 elseif (count($results) > 1) { 01538 watchdog("T_gff3_loader", "multiple landmarks exist with the name: '%landmark'. Cannot 01539 resolve which one to use. 
Cannot add the feature location record", 01540 array('%landmark' => $landmark), WATCHDOG_WARNING); 01541 return 0; 01542 } 01543 else { 01544 $srcfeature = $results[0]; 01545 } 01546 01547 // TODO: create an attribute that recognizes the residue_info,locgroup, 01548 // is_fmin_partial and is_fmax_partial, right now these are 01549 // hardcoded to be false and 0 below. 01550 01551 01552 // check to see if this featureloc already exists, but also keep track of the 01553 // last rank value 01554 $rank = 0; 01555 $exists = 0; 01556 $select = array('feature_id' => $feature->feature_id); 01557 $options = array( 01558 'statement_name' => 'sel_featureloc_fe', 01559 'order_by' => array( 01560 'rank' => 'ASC' 01561 ), 01562 ); 01563 $locrecs = tripal_core_chado_select('featureloc', array('*'), $select, $options); 01564 01565 foreach ($locrecs as $featureloc) { 01566 // it is possible for the featureloc->srcfeature_id to be NULL. This can happen if the srcfeature 01567 // is not known (according to chado table field descriptions). If it's null then just skip this entry 01568 if (!$featureloc->srcfeature_id) { 01569 continue; 01570 } 01571 $select = array('feature_id' => $featureloc->srcfeature_id); 01572 $options = array('statement_name' => 'sel_feature_fe'); 01573 $columns = array('feature_id', 'name'); 01574 $locsfeature = tripal_core_chado_select('feature', $columns, $select, $options); 01575 01576 // the source feature name and at least the fmin and fmax must be the same 01577 // for an update of the featureloc, otherwise we'll insert a new record. 
01578 if (strcmp($locsfeature[0]->name, $landmark)==0 and 01579 ($featureloc->fmin == $fmin or $featureloc->fmax == $fmax)) { 01580 $match = array('featureloc_id' => $featureloc->featureloc_id); 01581 $values = array(); 01582 $exists = 1; 01583 if ($featureloc->fmin != $fmin) { 01584 $values['fmin'] = $fmin; 01585 } 01586 if ($featureloc->fmax != $fmax) { 01587 $values['fmax'] = $fmax; 01588 } 01589 if ($featureloc->strand != $strand) { 01590 $values['strand'] = $strand; 01591 } 01592 if (count($values) > 0) { 01593 $options = array('statement_name' => 'upd_featureloc_all'); 01594 tripal_core_chado_update('featureloc', $match, $values, $options); 01595 } 01596 } 01597 $rank = $featureloc->rank + 1; 01598 } 01599 if (!$exists) { 01600 01601 // this feature location is new so add it 01602 if (strcmp($is_fmin_partial, 'f')==0 or !$is_fmin_partial) { 01603 $is_fmin_partial = 'FALSE'; 01604 } 01605 elseif (strcmp($is_fmin_partial, 't')==0 or $is_fmin_partial = 1) { 01606 $is_fmin_partial = 'TRUE'; 01607 } 01608 if (strcmp($is_fmax_partial, 'f')==0 or !$is_fmax_partial) { 01609 $is_fmax_partial = 'FALSE'; 01610 } 01611 elseif (strcmp($is_fmax_partial, 't')==0 or $is_fmax_partial = 1) { 01612 $is_fmax_partial = 'TRUE'; 01613 } 01614 $values = array( 01615 'feature_id' => $feature->feature_id, 01616 'srcfeature_id' => $srcfeature->feature_id, 01617 'fmin' => $fmin, 01618 'is_fmin_partial' => $is_fmin_partial, 01619 'fmax' => $fmax, 01620 'is_fmax_partial' => $is_fmax_partial, 01621 'strand' => $strand, 01622 'residue_info' => $residue_info, 01623 'locgroup' => $locgroup, 01624 'rank' => $rank 01625 ); 01626 $options = array('statement_name' => 'ins_featureloc_all'); 01627 if ($phase) { 01628 $values['phase'] = $phase; 01629 $options = array('statement_name' => 'ins_featureloc_allphase'); 01630 } 01631 $success = tripal_core_chado_insert('featureloc', $values, $options); 01632 if (!$success) { 01633 watchdog("T_gff3_loader", "Failed to insert featureloc", array(), 
WATCHDOG_WARNING); 01634 exit; 01635 return 0; 01636 } 01637 } 01638 return 1; 01639 }

/**
 * Attaches a property (a featureprop record) to a feature.
 *
 * The property name is looked up as a cvterm in the 'feature_property'
 * vocabulary and is added there first when no such term exists. The value is
 * then stored using the next free rank, unless the feature already has a
 * property of this type with exactly the same value.
 *
 * @param $feature
 *   The feature object. Only its feature_id member is read.
 * @param $property
 *   The name of the property. It is used as the cvterm name.
 * @param $value
 *   The value to store for the property.
 *
 * @return
 *   0 when the cvterm for the property could not be added. In every other
 *   case (including a failed featureprop insert, which is only logged) no
 *   value is returned.
 */
function tripal_feature_load_gff3_property($feature, $property, $value) {

  // first make sure the cvterm exists. if not, then add it
  $select = array(
    'name' => $property,
    'cv_id' => array(
      'name' => 'feature_property',
    ),
  );
  $options = array('statement_name' => 'sel_cvterm_name_cvid');
  $result = tripal_core_chado_select('cvterm', array('*'), $select, $options);

  // empty() also covers a non-array (failed) select result, which count()
  // would report as one element.
  if (empty($result)) {
    $term = array(
      'id' => "null:$property",
      'name' => $property,
      'namespace' => 'feature_property',
      'is_obsolete' => 0,
    );
    // Test the raw return value BEFORE casting it. Casting a failed (FALSE,
    // 0 or NULL) result to an object yields a stdClass instance, and objects
    // are always truthy, so the failure could never be detected afterwards.
    $added = tripal_cv_add_cvterm($term, 'feature_property', 0, 0);
    if (!$added) {
      watchdog('T_gff3_loader', 'Cannot add cvterm, %property',
        array('%property' => $property), WATCHDOG_WARNING);
      return 0;
    }
    $cvterm = (object) $added;
  }
  else {
    $cvterm = $result[0];
  }

  // check to see if the property already exists for this feature.
  // if it does but the value is unique then increment the rank and add it.
  // if the value is not unique then don't add it.
  $rank = 0;
  $select = array(
    'feature_id' => $feature->feature_id,
    'type_id' => $cvterm->cvterm_id,
  );
  $options = array(
    'statement_name' => 'sel_featureprop_featureid_typeid',
    'order_by' => array(
      'rank' => 'ASC',
    ),
  );
  $results = tripal_core_chado_select('featureprop', array('*'), $select, $options);
  if (is_array($results)) {
    foreach ($results as $prop) {
      if (strcmp($prop->value, $value) == 0) {
        // the feature already has this exact value, nothing to add
        return;
      }
      // the rows are ordered by rank, so after the loop this is one past the
      // highest rank in use
      $rank = $prop->rank + 1;
    }
  }

  // the value is new for this feature, so add the property
  $values = array(
    'feature_id' => $feature->feature_id,
    'type_id' => $cvterm->cvterm_id,
    'value' => $value,
    'rank' => $rank,
  );
  $options = array('statement_name' => 'ins_featureprop_all');
  $result = tripal_core_chado_insert('featureprop', $values, $options);
  if (!$result) {
    watchdog('T_gff3_loader', 'cannot add featureprop, %property',
      array('%property' => $property), WATCHDOG_WARNING);
  }
}

/**
 * Helper: stores the residues collected for one FASTA record.
 *
 * Looks up the feature_id for $id with the 'sel_gfftemp_un' prepared
 * statement (which the caller must have prepared) and, when a feature is
 * found, updates the residues of that feature. A warning is logged when no
 * feature can be found.
 *
 * @param $id
 *   The text of the FASTA definition line without the leading '>'.
 * @param $residues
 *   The concatenated sequence lines that followed the definition line.
 */
function _tripal_feature_load_gff3_fasta_save($id, $residues) {
  $sql = "EXECUTE sel_gfftemp_un('%s')";
  $result = tripal_core_chado_execute_prepared('sel_gfftemp_un', $sql, array($id));

  // a successful query may still return no row, so check the fetched record
  // as well as the query result before using feature_id
  $feature = $result ? db_fetch_object($result) : FALSE;
  if (!$feature) {
    watchdog('T_gff3_loader', 'Cannot find feature to assign FASTA sequence: %uname',
      array('%uname' => $id), WATCHDOG_WARNING);
    return;
  }

  $values = array('residues' => $residues);
  $match = array('feature_id' => $feature->feature_id);
  $options = array('statement_name' => 'upd_feature_re');
  tripal_core_chado_update('feature', $match, $values, $options);
}

/**
 * Reads FASTA records from an open file handle and stores the residues.
 *
 * Every line starting with '>' begins a new record; the rest of that line is
 * used as the uniquename to look up in the tripal_gff_temp table. All
 * following lines, up to the next '>' line or the end of the file, are
 * concatenated and written to the residues of the matching feature.
 *
 * @param $fh
 *   The open file handle. It is read with fgets() until the end of the file.
 * @param $interval
 *   The amount of data (as counted by drupal_strlen()) to read between two
 *   progress updates.
 * @param $num_read
 *   Running total of the data read so far. Updated by reference.
 * @param $intv_read
 *   The data read since the last progress update. Updated by reference.
 * @param $line_num
 *   The current line number. Updated by reference.
 * @param $job
 *   Optional. The job identifier passed to tripal_job_set_progress(). When
 *   it is empty no progress is reported.
 * @param $filesize
 *   Optional. The total size of the file, used to compute the percentage.
 *   When it is not greater than zero no progress is reported.
 *
 * @return
 *   An empty string when the lookup statement cannot be prepared. Otherwise
 *   no value is returned.
 */
function tripal_feature_load_gff3_fasta($fh, $interval, &$num_read, &$intv_read, &$line_num, $job = NULL, $filesize = 0) {
  print "Loading FASTA sequences\n";

  $sql = "
    PREPARE sel_gfftemp_un (text) AS
    SELECT feature_id FROM tripal_gff_temp
    WHERE uniquename = $1
  ";
  $status = tripal_core_chado_prepare('sel_gfftemp_un', $sql, array('text'));
  if (!$status) {
    watchdog('T_gff3_loader', 'Cannot prepare statement \'sel_gfftemp_un\'.',
      array(), WATCHDOG_ERROR);
    return '';
  }

  // the record currently being collected; $id stays NULL until the first
  // definition line has been seen
  $id = NULL;
  $residues = '';

  // iterate through the remaining lines of the file. The strict comparison
  // keeps a final line consisting of just "0" from ending the loop early.
  while (($line = fgets($fh)) !== FALSE) {

    $line_num++;
    $size = drupal_strlen($line);
    $num_read += $size;
    $intv_read += $size;

    $line = trim($line);

    // update the job status each time another $interval has been read. This
    // needs both a job and a usable file size (which also avoids a division
    // by zero).
    if ($job and $filesize > 0 and $intv_read >= $interval) {
      $intv_read = 0;
      $percent = sprintf("%.2f", ($num_read / $filesize) * 100);
      print "Parsing Line $line_num (" . $percent . "%). Memory: " . number_format(memory_get_usage()) . " bytes.\r";
      tripal_job_set_progress($job, intval(($num_read / $filesize) * 100));
    }

    if (preg_match('/^>/', $line)) {
      // a definition line begins a new sequence, so save the one we just
      // finished (if there is one) ...
      if ($id !== NULL and $id !== '') {
        _tripal_feature_load_gff3_fasta_save($id, $residues);
      }
      // ... and start collecting the new one
      $id = preg_replace('/^>(.*)$/', '\1', $line);
      $residues = '';
    }
    else {
      $residues .= $line;
    }
  }

  // add in the last sequence, but only when a definition line was read at all
  if ($id !== NULL and $id !== '') {
    _tripal_feature_load_gff3_fasta_save($id, $residues);
  }
}

/**
 * Helper: looks up the cvterm_id of a term in the 'sequence' vocabulary.
 *
 * @param $type_name
 *   The name of the term.
 *
 * @return
 *   The cvterm_id when exactly one matching term is found, NULL otherwise.
 */
function _tripal_feature_load_gff3_target_type_id($type_name) {
  $values = array(
    'name' => $type_name,
    'cv_id' => array(
      'name' => 'sequence',
    ),
  );
  $options = array('statement_name' => 'sel_cvterm_nacv');
  $type = tripal_core_chado_select('cvterm', array('cvterm_id'), $values, $options);
  if (is_array($type) and count($type) == 1) {
    return $type[0]->cvterm_id;
  }
  return NULL;
}

/**
 * Adds the alignment described by a GFF3 'Target' attribute to a feature.
 *
 * The attribute value has the format "target_id start end [strand]". The
 * 1-based, inclusive start/end pair is converted to a 0-based fmin and an
 * fmax, and the optional strand to 1, -1 or 0. The result is handed to
 * tripal_feature_load_gff3_featureloc() with the target as the landmark.
 *
 * @param $feature
 *   The feature object passed on to the featureloc loader.
 * @param $tags
 *   The attributes of the GFF3 line as an array of value lists keyed by
 *   attribute name. 'Target' is required; 'target_organism' (formatted as
 *   genus:species) and 'target_type' are optional and override
 *   $target_organism_id and $target_type.
 * @param $target_organism_id
 *   The default organism_id of the target.
 * @param $target_type
 *   The default type of the target: a term name from the 'sequence'
 *   vocabulary. When it is set but cannot be found the script is terminated.
 * @param $create_target
 *   Passed on unchanged to the featureloc loader.
 * @param $attr_locgroup
 *   Passed on unchanged to the featureloc loader.
 */
function tripal_feature_load_gff3_target($feature, $tags, $target_organism_id, $target_type, $create_target, $attr_locgroup) {
  // format is: "target_id start end [strand]", where strand is optional and may be "+" or "-"
  $target_parts = array();
  $matched = preg_match('/^(.*?)\s+(\d+)\s+(\d+)(\s+[\+|\-])*$/', trim($tags['Target'][0]), $target_parts);

  // the target attribute is not correctly formatted
  if (!$matched) {
    watchdog('T_gff3_loader', "Could not add 'Target' alignment as it is improperly formatted: '%target'",
      array('%target' => $tags['Target'][0]), WATCHDOG_ERROR);
    return;
  }

  // the organism and type of the target may also be specified as an attribute. If so, then get that
  // information
  $gff_target_organism = array_key_exists('target_organism', $tags) ? $tags['target_organism'][0] : '';
  $gff_target_type = array_key_exists('target_type', $tags) ? $tags['target_type'][0] : '';

  $target_feature = $target_parts[1];
  $start = $target_parts[2];
  $end = $target_parts[3];

  // if we have an optional strand, convert it to a numeric value. preg_match
  // leaves a trailing group that did not take part in the match out of the
  // result, so the index must be tested with isset() first.
  $target_strand = 0;
  if (isset($target_parts[4])) {
    $strand = trim($target_parts[4]);
    if ($strand === '+') {
      $target_strand = 1;
    }
    elseif ($strand === '-') {
      $target_strand = -1;
    }
  }

  // convert to a 0-based fmin, swapping the coordinates when they are reversed
  $target_fmin = $start - 1;
  $target_fmax = $end;
  if ($end < $start) {
    $target_fmin = $end - 1;
    $target_fmax = $start;
  }

  // default the target organism to be the value passed into the function, but if the GFF
  // file specifies the target organism then use that instead.
  $t_organism_id = $target_organism_id;
  if ($gff_target_organism) {
    // get the genus and species
    $organism_parts = array();
    if (preg_match('/^(.*?):(.*?)$/', $gff_target_organism, $organism_parts)) {
      $values = array(
        'genus' => $organism_parts[1],
        'species' => $organism_parts[2],
      );
      $options = array('statement_name' => 'sel_organism_gesp');
      $torganism = tripal_core_chado_select('organism', array('organism_id'), $values, $options);
      if (is_array($torganism) and count($torganism) == 1) {
        $t_organism_id = $torganism[0]->organism_id;
      }
      else {
        watchdog('T_gff3_loader', "Cannot find organism for target %target.",
          array('%target' => $gff_target_organism), WATCHDOG_WARNING);
        $t_organism_id = '';
      }
    }
    else {
      watchdog('T_gff3_loader', "The target_organism attribute is improperly formatted: %target. It should be target_organism=genus:species.",
        array('%target' => $gff_target_organism), WATCHDOG_WARNING);
      $t_organism_id = '';
    }
  }

  // default the target type to be the value passed into the function, but if the GFF file
  // specifies the target type then use that instead
  $t_type_id = '';
  if ($target_type) {
    $type_id = _tripal_feature_load_gff3_target_type_id($target_type);
    if ($type_id === NULL) {
      watchdog('T_gff3_loader', "The target type does not exist in the sequence ontology: %type. ",
        array('%type' => $target_type), WATCHDOG_ERROR);
      // an unknown default type is treated as fatal: this ends the whole script
      exit;
    }
    $t_type_id = $type_id;
  }
  if ($gff_target_type) {
    $type_id = _tripal_feature_load_gff3_target_type_id($gff_target_type);
    if ($type_id === NULL) {
      watchdog('T_gff3_loader', "The target_type attribute does not exist in the sequence ontology: %type. ",
        array('%type' => $gff_target_type), WATCHDOG_WARNING);
      $t_type_id = '';
    }
    else {
      $t_type_id = $type_id;
    }
  }

  // None of the following values is available inside this function, so they
  // are passed as NULL. Earlier revisions passed undefined variables here,
  // which evaluated to the same NULL values but raised notices.
  // NOTE(review): confirm that the featureloc loader does not need a real
  // organism object when it is called for a target.
  $organism = NULL;
  $phase = NULL;
  $attr_fmin_partial = NULL;
  $attr_fmax_partial = NULL;
  $attr_residue_info = NULL;

  // we want to add a featureloc record that uses the target feature as the srcfeature (landmark)
  // and the landmark as the feature.
  tripal_feature_load_gff3_featureloc($feature, $organism, $target_feature, $target_fmin,
    $target_fmax, $target_strand, $phase, $attr_fmin_partial, $attr_fmax_partial, $attr_residue_info,
    $attr_locgroup, $t_type_id, $t_organism_id, $create_target, TRUE);
}