| Trees | Indices | Help |
|---|
|
|
1 # Copyright 2006, 2007 by Peter Cock. All rights reserved.
2 #
3 # This code is part of the Biopython distribution and governed by its
4 # license. Please see the LICENSE file that should have been included
5 # as part of this package.
6
7 """Bio.SeqIO support for the "stockholm" (aka PFAM) file format.
8
9 You were expected to use this module via the Bio.SeqIO functions.
10 This module has now been replaced by Bio.AlignIO.PhylipIO, and is
11 deprecated."""
12
13 import warnings
14 warnings.warn("Bio.SeqIO.StockholmIO is deprecated. You can continue to read" \
15 + " and write 'clustal' files with Bio.SeqIO, but this is now" \
16 + " handled via Bio.AlignIO internally.",
17 DeprecationWarning)
18
19 from Bio.Alphabet import single_letter_alphabet
20 from Bio.Seq import Seq
21 from Bio.SeqRecord import SeqRecord
22 from Interfaces import InterlacedSequenceIterator, SequentialSequenceWriter
23
25 """Loads a Stockholm file from PFAM into SeqRecord objects.
26
27 The entire file is loaded, and any sequence can be accessed using
28 the [index] notation.
29
30 This parser will detect if the Stockholm file follows the PFAM conventions
31 for sequence specific meta-data (lines starting #=GS and #=GR) and
32 populates the SeqRecord fields accordingly.
33
34 Any annotation which does not follow the PFAM conventions is currently
35 ignored.
36
37 If an accession is provided for an entry in the meta data, IT WILL NOT
38 be used as the record.id (it will be recorded in the record's annotations).
39 This is because some files have (sub) sequences from different parts of
40 the same accession (differentiated by different start-end positions).
41
42 Wrap-around alignments are not supported - each sequences must be on
43 a single line. However, interlaced sequences should work.
44
45 For more information on the file format, please see:
46 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
47 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
48
49 For consistency with BioPerl and EMBOSS we call this the "stockholm"
50 format.
51 """
52
53 #These dictionaries should be kept in sync with those
54 #defined in the PfamStockholmWriter class.
55 pfam_gr_mapping = {"SS" : "secondary_structure",
56 "SA" : "surface_accessibility",
57 "TM" : "transmembrane",
58 "PP" : "posterior_probability",
59 "LI" : "ligand_binding",
60 "AS" : "active_site",
61 "IN" : "intron"}
62 #Following dictionary deliberately does not cover AC, DE or DR
63 pfam_gs_mapping = {"OS" : "organism",
64 "OC" : "organism_classification",
65 "LO" : "look"}
66
67 #TODO - Should the default be Gapped(single_letter_alphabet) instead?
69 """Create a StockholmIterator object (which returns SeqRecord objects).
70
71 handle - input file
72 alphabet - optional alphabet
73 """
74 self.handle = handle
75 self.alphabet = alphabet
76
77 self.sequences = {}
78 self.ids = []
79 self.ParseAlignment()
80 self.move_start()
81
83 line = self.handle.readline()
84 if not line:
85 #Empty file - just give up.
86 return
87 if not line.strip() == '# STOCKHOLM 1.0':
88 raise ValueError("Did not find STOCKHOLM header")
89 #import sys
90 #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0'
91
92 # Note: If this file follows the PFAM conventions, there should be
93 # a line containing the number of sequences, e.g. "#=GF SQ 67"
94 # We do not check for this - perhaps we should, and verify that
95 # if present it agrees with our parsing.
96
97 seqs = {}
98 ids = []
99 gs = {}
100 gr = {}
101 passed_end_alignment = False
102 while 1:
103 line = self.handle.readline()
104 if not line: break #end of file
105 line = line.strip() #remove trailing \n
106 if line == "//" :
107 #The "//" line indicates the end of the alignment.
108 #There may still be more meta-data
109 passed_end_alignment = True
110 elif line == "" :
111 #blank line, ignore
112 pass
113 elif line[0] <> "#" :
114 #Sequence
115 #Format: "<seqname> <sequence>"
116 assert not passed_end_alignment
117 parts = [x.strip() for x in line.split(" ",1)]
118 if len(parts) <> 2 :
119 #This might be someone attempting to store a zero length sequence?
120 raise ValueError("Could not split line into identifier " \
121 + "and sequence:\n" + line)
122 id, seq = parts
123 if id not in ids :
124 ids.append(id)
125 seqs.setdefault(id, '')
126 seqs[id] += seq.replace(".","-")
127 elif len(line) >= 5 :
128 #Comment line or meta-data
129 if line[:5] == "#=GF " :
130 #Generic per-File annotation, free text
131 #Format: #=GF <feature> <free text>
132 pass
133 elif line[:5] == '#=GC ':
134 #Generic per-Column annotation, exactly 1 char per column
135 #Format: "#=GC <feature> <exactly 1 char per column>"
136 pass
137 elif line[:5] == '#=GS ':
138 #Generic per-Sequence annotation, free text
139 #Format: "#=GS <seqname> <feature> <free text>"
140 id, feature, text = line[5:].strip().split(None,2)
141 #if id not in ids :
142 # ids.append(id)
143 if id not in gs :
144 gs[id] = {}
145 if feature not in gs[id] :
146 gs[id][feature] = [text]
147 else :
148 gs[id][feature].append(text)
149 elif line[:5] == "#=GR " :
150 #Generic per-Sequence AND per-Column markup
151 #Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
152 id, feature, text = line[5:].strip().split(None,2)
153 #if id not in ids :
154 # ids.append(id)
155 if id not in gr :
156 gr[id] = {}
157 if feature not in gr[id]:
158 gr[id][feature] = ""
159 gr[id][feature] += text.strip() # append to any previous entry
160 #TODO - Should we check the length matches the alignment length?
161 # For iterlaced sequences the GR data can be split over
162 # multiple lines
163 #Next line...
164
165
166 assert len(seqs) <= len(ids)
167 #assert len(gs) <= len(ids)
168 #assert len(gr) <= len(ids)
169
170 #This is some paranoia based on some pathelogical test cases.
171 if ids and seqs :
172 align_len = None
173 for id, seq in seqs.iteritems() :
174 assert id in ids
175 if align_len is None :
176 align_len = len(seq)
177 elif align_len <> len(seq) :
178 raise ValueError("Sequences have different lengths, or repeated identifier")
179
180
181 self.ids = ids
182 self.sequences = seqs
183 self.seq_annotation = gs
184 self.seq_col_annotation = gr
185 self.move_start()
186
188 return len(self.ids)
189
191 """Returns (name,start,end) string tuple from an identier"""
192 if identifier.find("/")<>-1 :
193 start_end = identifier.split("/",1)[1]
194 if start_end.count("-")==1 :
195 start, end = start_end.split("-")
196 name = identifier.split("/",1)[0]
197 return (name, start, end)
198 return (identifier, None, None)
199
201 """Takes an itentifier and returns dict of all meta-data matching it.
202
203 For example, given "Q9PN73_CAMJE/149-220" will return all matches to
204 this or "Q9PN73_CAMJE" which the identifier without its /start-end
205 suffix.
206
207 In the example below, the suffix is required to match the AC, but must
208 be removed to match the OS and OC meta-data.
209
210 # STOCKHOLM 1.0
211 #=GS Q9PN73_CAMJE/149-220 AC Q9PN73
212 ...
213 Q9PN73_CAMJE/149-220 NKA...
214 ...
215 #=GS Q9PN73_CAMJE OS Campylobacter jejuni
216 #=GS Q9PN73_CAMJE OC Bacteria
217
218 This function will return an empty dictionary if no data is found"""
219 name, start, end = self._identifier_split(identifier)
220 if name==identifier :
221 identifier_keys = [identifier]
222 else :
223 identifier_keys = [identifier, name]
224 answer = {}
225 for identifier_key in identifier_keys :
226 try :
227 for feature_key in meta_dict[identifier_key] :
228 answer[feature_key] = meta_dict[identifier_key][feature_key]
229 except KeyError :
230 pass
231 return answer
232
234 """Adds meta-date to a SecRecord's annotations dictionary.
235
236 This function applies the PFAM conventions."""
237
238 seq_data = self._get_meta_data(identifier, self.seq_annotation)
239 for feature in seq_data :
240 #Note this dictionary contains lists!
241 if feature=="AC" : #ACcession number
242 assert len(seq_data[feature])==1
243 record.annotations["accession"]=seq_data[feature][0]
244 elif feature=="DE" : #DEscription
245 record.description = "\n".join(seq_data[feature])
246 elif feature=="DR" : #Database Reference
247 #Should we try and parse the strings?
248 record.dbxrefs = seq_data[feature]
249 elif feature in self.pfam_gs_mapping :
250 record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(seq_data[feature])
251 else :
252 #Ignore it?
253 record.annotations["GS:" + feature] = ", ".join(seq_data[feature])
254
255 seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation)
256 for feature in seq_col_data :
257 #Note this dictionary contains strings!
258 if feature in self.pfam_gr_mapping :
259 record.annotations[self.pfam_gr_mapping[feature]] = seq_col_data[feature]
260 else :
261 #Ignore it?
262 record.annotations["GR:" + feature] = seq_col_data[feature]
263
265 """Provides random access to the SeqRecords."""
266 if i < 0 or i >= len(self.ids) : raise ValueError
267 id = self.ids[i]
268 seq_len = len(self.sequences[id])
269
270 name, start, end = self._identifier_split(id)
271
272 record = SeqRecord(Seq(self.sequences[id], self.alphabet), id=id, name=name, description=id)
273
274 if start : record.annotations["start"] = int(start)
275 if end : record.annotations["end"] = int(end)
276
277 #will be overridden by _populate_meta_data if an explicit accession is provided:
278 record.annotations["accession"]=name
279
280 self._populate_meta_data(id, record)
281
282 #DO NOT TOUCH self._n
283 return record
284
286 """Class to write PFAM style Stockholm format files.
287
288 Note that sequences and their annotation are recorded
289 together (rather than having a block of annotation followed
290 by a block of aligned sequences).
291 """
292
293 #These dictionaries should be kept in sync with those
294 #defined in the PfamStockholmIterator class.
295 pfam_gr_mapping = { "secondary_structure" : "SS",
296 "surface_accessibility" : "SA",
297 "transmembrane" : "TM",
298 "posterior_probability" : "PP",
299 "ligand_binding" : "LI",
300 "active_site" : "AS",
301 "intron" : "IN"}
302 #Following dictionary deliberately does not cover AC, DE or DR
303 pfam_gs_mapping = {"organism" : "OS",
304 "organism_classification" : "OC",
305 "look" : "LO"}
306
308 """Creates the writer object.
309
310 Use the method write_file() to actually record your sequence records."""
311 SequentialSequenceWriter.__init__(self, handle)
312 self._ids_written = []
313 self._length_of_sequences = None
314
316 """Must supply the number of records (count)."""
317 SequentialSequenceWriter.write_header(self) # sets flags
318 self.handle.write("# STOCKHOLM 1.0\n")
319 self.handle.write("#=GF SQ %i\n" % count)
320
322 """Use this to write an entire file containing the given records.
323
324 records - A list or iterator returning SeqRecord objects.
325 If len(records) is not supported, then it will be
326 converted into a list.
327
328 This method should only be called once for each file."""
329 try :
330 #This will work for a list, and some of the SeqIO
331 #iterators too, like the StockholmIterator
332 count = len(records)
333 except TypeError :
334 #Probably have an standard iterator, not a list...
335 records = list(records)
336 count = len(records)
337
338 if count == 0 :
339 raise ValueError("Must have at least one sequence")
340
341 self.write_header(count)
342 self.write_records(records)
343 self.write_footer()
344 #Don't automatically close the file. This would prevent
345 #things like writing concatenated alignments as used for
346 #phylogenetic bootstrapping (usually done with phylip).
347 #self.close()
348
350 """Write a single Stockholm record to the file."""
351 assert self._header_written
352 assert not self._footer_written
353 self._record_written = True
354
355 if self._length_of_sequences is None :
356 self._length_of_sequences = len(record.seq)
357 elif self._length_of_sequences <> len(record.seq) :
358 raise ValueError("Sequences must all be the same length")
359
360 if self._length_of_sequences == 0 :
361 raise ValueError("Non-empty sequences are required")
362
363 #For the case for stockholm to stockholm, try and use record.name
364 seq_name = record.id
365 if record.name is not None :
366 if "accession" in record.annotations :
367 if record.id == record.annotations["accession"] :
368 seq_name = record.name
369
370 #In the Stockholm file format, spaces are not allowed in the id
371 seq_name = seq_name.replace(" ","_")
372
373 if "start" in record.annotations \
374 and "end" in record.annotations :
375 suffix = "/%s-%s" % (str(record.annotations["start"]),
376 str(record.annotations["end"]))
377 if seq_name[-len(suffix):] <> suffix :
378 seq_name = "%s/%s-%s" % (seq_name,
379 str(record.annotations["start"]),
380 str(record.annotations["end"]))
381
382 if seq_name in self._ids_written :
383 raise ValueError("Duplicate record identifier: %s" % seq_name)
384 self._ids_written.append(seq_name)
385 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
386
387 #The recommended placement for GS lines (per sequence annotation)
388 #is above the alignment (as a header block) or just below the
389 #corresponding sequence.
390 #
391 #The recommended placement for GR lines (per sequence per column
392 #annotation such as secondary structure) is just below the
393 #corresponding sequence.
394 #
395 #We put both just below the corresponding sequence as this allows
396 #us to write the file using a single pass through the records.
397
398 #AC = Accession
399 if "accession" in record.annotations :
400 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.annotations["accession"])))
401 elif record.id :
402 self.handle.write("#=GS %s AC %s\n" % (seq_name, self.clean(record.id)))
403
404 #DE = description
405 if record.description :
406 self.handle.write("#=GS %s DE %s\n" % (seq_name, self.clean(record.description)))
407
408 #DE = database links
409 for xref in record.dbxrefs :
410 self.handle.write("#=GS %s DR %s\n" % (seq_name, self.clean(xref)))
411
412 #GS/GR = other per sequence annotation
413 for key in record.annotations :
414 if key in self.pfam_gs_mapping :
415 self.handle.write("#=GS %s %s %s\n" \
416 % (seq_name,
417 self.clean(self.pfam_gs_mapping[key]),
418 self.clean(str(record.annotations[key]))))
419 elif key in self.pfam_gr_mapping :
420 if len(str(record.annotations[key]))==len(record.seq) :
421 self.handle.write("#=GR %s %s %s\n" \
422 % (seq_name,
423 self.clean(self.pfam_gr_mapping[key]),
424 self.clean((record.annotations[key]))))
425 else :
426 #Should we print a warning?
427 pass
428 else :
429 #It doesn't follow the PFAM standards, but should we record this data anyway?
430 pass
431
432
433
434
446
447 if __name__ == "__main__" :
448 print "Testing..."
449 from cStringIO import StringIO
450
451
452 # This example with its slightly odd (partial) annotation is from here:
453 # http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
454 # I don't know what the "GR_O31699/88-139_IN ..." line is meant to be.
455 sth_example = \
456 """# STOCKHOLM 1.0
457 #=GF ID CBS
458 #=GF AC PF00571
459 #=GF DE CBS domain
460 #=GF AU Bateman A
461 #=GF CC CBS domains are small intracellular modules mostly found
462 #=GF CC in 2 or four copies within a protein.
463 #=GF SQ 67
464 #=GS O31698/18-71 AC O31698
465 #=GS O83071/192-246 AC O83071
466 #=GS O83071/259-312 AC O83071
467 #=GS O31698/88-139 AC O31698
468 #=GS O31698/88-139 OS Bacillus subtilis
469 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
470 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
471 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
472 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
473 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
474 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
475 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
476 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
477 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
478 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
479 #=GR O31699/88-139 AS ________________*__________________________
480 #=GR_O31699/88-139_IN ____________1______________2__________0____
481 //
482 """
483
484 # Interlaced example from BioPerl documentation. Also note the blank line.
485 # http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
486 sth_example2 = \
487 """# STOCKHOLM 1.0
488 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
489 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
490 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
491 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
492 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
493
494 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
495 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
496 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
497 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
498 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
499 //"""
500
501 print "--------"
502 print "StockholmIterator(stockholm alignment file)"
503 iterator = StockholmIterator(StringIO(sth_example))
504 count=0
505 for record in iterator :
506 count=count+1
507 assert count == 5
508 #Check the first record...
509 assert record.id == 'O31699/88-139'
510 assert record.name == 'O31699'
511 assert record.description == 'O31699/88-139'
512 assert len(record.annotations)==4
513 assert record.annotations["accession"]=='O31699'
514 assert record.annotations["start"]==88
515 assert record.annotations["end"]==139
516 assert record.annotations["active_site"]=='________________*__________________________'
517
518 iterator = StockholmIterator(StringIO(sth_example))
519 count=0
520 for record in iterator :
521 count=count+1
522 break
523 assert count==1
524 #Check the last record...
525 assert record.id == 'O83071/192-246'
526 assert record.name == 'O83071'
527 assert record.description == 'O83071/192-246'
528 assert len(record.annotations)==4
529 assert record.annotations["accession"]=='O83071'
530 assert record.annotations["start"]==192
531 assert record.annotations["end"]==246
532 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777"
533
534 assert [r.id for r in StockholmIterator(StringIO(sth_example))] \
535 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
536
537 assert [r.name for r in StockholmIterator(StringIO(sth_example))] \
538 == ['O83071', 'O83071', 'O31698', 'O31698', 'O31699']
539
540 assert [r.description for r in StockholmIterator(StringIO(sth_example))] \
541 == ['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']
542
543
544 print "--------"
545 print "StockholmIterator(interlaced stockholm alignment file)"
546 iterator = StockholmIterator(StringIO(sth_example2))
547 record = iterator.next()
548 assert record.id == "AP001509.1"
549 assert len(record.seq) == 104
550 assert "secondary_structure" in record.annotations
551 assert len(record.annotations["secondary_structure"]) == 104
552 record = iterator.next()
553 assert record.id == "AE007476.1"
554 assert len(record.seq) == 104
555 assert "secondary_structure" in record.annotations
556 assert len(record.annotations["secondary_structure"]) == 104
557 record = iterator.next()
558 assert record is None
559
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Mon Sep 15 09:22:58 2008 | http://epydoc.sourceforge.net |