| Home | Trees | Indices | Help |
|
|---|
|
|
1 """
2 FASTA format sequence I/O.
3
4 This module provides parsers and writers for sequences and alignments in
5 FASTA format. The most basic usage is:
6
7 >>> parser = SequenceParser()
8 >>> parser.parse_file('sequences.fa')
9 <SequenceCollection> # collection of L{AbstractSequence}s
10
11 This will load all sequences in memory. If you are parsing a huge file,
12 then you could efficiently read the file sequence by sequence:
13
14 >>> for seq in parser.read('sequences.fa'):
15 ... # seq is an L{AbstractSequence}
16
17 L{BaseSequenceParser} is the central class in this module, which defines a
18 common infrastructure for all sequence readers. L{SequenceParser} is a standard
19 implementation, and L{PDBSequenceParser} is specialized to read FASTA sequences
20 with PDB headers.
21
22 For parsing alignments, have a look at L{SequenceAlignmentReader} and
23 L{StructureAlignmentFactory}.
24
25 Finally, this module provides a number of L{OutputBuilder}s, which know how to
26 write L{AbstractSequence} and L{AbstractAlignment} objects to FASTA files:
27
28 >>> with open('file.fa', 'w') as out:
29 builder = OutputBuilder.create(AlignmentFormats.FASTA, out)
30 builder.add_alignment(alignment)
31 builder.add_sequence(sequence)
32 ...
33
34 or you could instantiate any of the L{OutputBuilder}s directly.
35 """
36
37 import csb.io
38 import csb.core
39
40 from abc import ABCMeta, abstractmethod
41
42 from csb.bio.sequence import SequenceTypes, SequenceAlphabets, AlignmentFormats, SequenceError
43 from csb.bio.sequence import SequenceAlignment, StructureAlignment, A3MAlignment
44 from csb.bio.sequence import SequenceCollection, AbstractSequence, Sequence, RichSequence, ChainSequence
48 """
49 FASTA parser template. Subclasses must implement the way FASTA strings are
50 handled by overriding C{BaseSequenceParser.read_sequence}.
51
52 @param product: sequence product factory (an L{AbstractSequence} subclass)
53 @type product: type
54 @param product_type: default L{SequenceTypes} member for the products
55 @type product_type: L{EnumItem}
56 """
57 __metaclass__ = ABCMeta
58
60
61 self._product = None
62
63 if not issubclass(product, AbstractSequence):
64 raise TypeError(product)
65
66 if not product_type.enum is SequenceTypes:
67 raise TypeError(product_type)
68
69 self._product = product
70 self._type = product_type
71
72 @property
79
80 @property
82 """
83 Default sequence type of the products - a member of L{SequenceTypes}
84 @rtype: enum item
85 """
86 return self._type
87
88 @abstractmethod
90 """
91 Parse a single FASTA string
92
93 @return: a new sequence, created with L{BaseSequenceParser.product_factory}
94 @rtype: L{AbstractSequence}
95 """
96 pass
97
99 """
100 Read FASTA sequences from an (m)FASTA-formatted string
101
102 @param fasta_string: FASTA string to parse
103 @type fasta_string: str
104
105 @return: a list of L{Sequence}s
106 @rtype: L{SequenceCollection}
107 """
108
109 stream = csb.io.MemoryStream()
110 stream.write(fasta_string)
111
112 return self.parse_file(stream)
113
115 """
116 Read FASTA sequences from a (m)FASTA file
117
118 @param fasta_file: input FASTA file name or opened stream
119 @type fasta_file: str, file
120
121 @return: a list of L{Sequence}s
122 @rtype: L{SequenceCollection}
123 """
124 if isinstance(fasta_file, csb.core.string):
125 stream = open(fasta_file)
126 else:
127 stream = fasta_file
128
129 seqs = []
130 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)
131
132 for entry in reader.entries():
133 seqs.append(self.read_sequence(entry))
134
135 return SequenceCollection(seqs)
136
138 """
139 Read FASTA sequences from an (m)FASTA file.
140
141 @param fasta_file: input FASTA file name or opened stream
142 @type fasta_file: str, file
143
144 @return: efficient cursor over all L{Sequence}s (parse on demand)
145 @rtype: iterator
146 """
147 if isinstance(fasta_file, csb.core.string):
148 stream = open(fasta_file)
149 else:
150 stream = fasta_file
151
152 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)
153
154 for entry in reader.entries():
155 yield self.read_sequence(entry)
156
158 """
159 Standard FASTA parser. See L{BaseSequenceParser} for details.
160 """
161
163
164 lines = string.strip().splitlines()
165
166 if not lines[0].startswith(AbstractSequence.DELIMITER):
167 lines = [''] + lines
168 if len(lines) < 2:
169 raise ValueError('Empty FASTA entry')
170
171 header = lines[0]
172 id = header[1:].split()[0]
173 sequence = ''.join(lines[1:])
174
175 return self.product_factory(id, header, sequence, self.product_type)
176
178 """
179 PDB FASTA parser. Reads the PDB ID and sequence type from the header.
180 See L{BaseSequenceParser} for more details.
181 """
182
184
185 seq = super(PDBSequenceParser, self).read_sequence(string)
186
187 if not (seq.header and seq.id) or not (len(seq.id) in(5, 6) and seq.header.find('mol:') != -1):
188 raise ValueError('Does not look like a PDB header: {0}'.format(seq.header))
189
190 seq.id = seq.id.replace('_', '')
191 stype = seq.header.partition('mol:')[2].partition(' ')[0]
192 try:
193 seq.type = csb.core.Enum.parse(SequenceTypes, stype)
194 except csb.core.EnumValueError:
195 seq.type = SequenceTypes.Unknown
196
197 return seq
198
201
203
204 self._temp = []
205 self._insertion = insertion
206
207 for sequence in sequences:
208
209 row = list(sequence.sequence)
210 row.reverse()
211 self._temp.append(row)
212
214
215 sequences = self._temp
216
217 column = [ ]
218 has_insertion = False
219
220 for sequence in sequences:
221 if len(sequence) > 0 and sequence[-1].islower():
222 has_insertion = True
223
224 for sequence in sequences:
225 try:
226 if has_insertion and not sequence[-1].islower():
227 column.append(self._insertion)
228 else:
229 column.append(sequence.pop())
230 except IndexError:
231 column.append('')
232
233 if not any(column):
234 raise StopIteration()
235
236 return column
237
240
242 return self.next()
243
245 """
246 Sequence alignment parser.
247
248 @param product_type: default L{SequenceTypes} member for the sequence products
249 @type product_type: L{EnumItem}
250 @param strict: if True, raise exception on duplicate sequence identifiers.
251 See L{csb.bio.sequence.AbstractAlignment} for details
252 @type strict: bool
253 """
254
256
257 if not product_type.enum is SequenceTypes:
258 raise TypeError(product_type)
259
260 self._type = product_type
261 self._strict = bool(strict)
262
263 @property
265 """
266 Default sequence type of the alignment entries - a member of L{SequenceTypes}
267 @rtype: enum item
268 """
269 return self._type
270
271 @property
278
280 """
281 Parse an alignment in multi-FASTA format.
282
283 @param string: alignment string
284 @type string: str
285
286 @rtype: L{SequenceAlignment}
287 """
288
289 parser = SequenceParser(RichSequence, self.product_type)
290 sequences = parser.parse_string(string)
291
292 return SequenceAlignment(sequences, strict=self.strict)
293
295 """
296 Parse an alignment in A3M format.
297
298 @param string: alignment string
299 @type string: str
300
301 @rtype: L{A3MAlignment}
302 """
303 alphabet = SequenceAlphabets.get(self.product_type)
304
305 # parse all "mis-aligned" sequences as case-sensitive strings
306 parser = SequenceParser(Sequence, self.product_type)
307 sequences = parser.parse_string(string)
308
309 # storage for expanded sequences
310 s = []
311
312 for dummy in sequences:
313 s.append([])
314
315 # expand all sequences with insertion characters and make them equal length
316 for column in A3MSequenceIterator(sequences, str(alphabet.INSERTION)):
317 for sn, char in enumerate(column):
318 s[sn].append(char)
319
320 # build normal sequence objects from the equalized sequence strings
321 aligned_seqs = []
322
323 for sn, seq in enumerate(sequences):
324
325 sequence = RichSequence(seq.id, seq.header, s[sn], self.product_type)
326 aligned_seqs.append(sequence)
327
328 return A3MAlignment(aligned_seqs, strict=self.strict)
329
331 """
332 Protein structure alignment parser.
333
334 In order to construct the structural alignment, this factory needs a PDB
335 structure provider: an object, whose C{provider.get} method returns a
336 L{csb.bio.structute.Structure} for a given sequence identifier. Sequence
337 identifiers on the other hand need to be split into 'accession number'
338 and 'chain ID'. By default this is done using a standard PDB Entry ID
339 factory, but clients are free to provide custom factories. An C{id_factory}
340 must be a callable, which accepts a single string identifier and returns
341 an EntryID object.
342
343 @param provider: data source for all structures found in the alignment
344 @type provider: L{csb.bio.io.wwpdb.StructureProvider}
345 @param id_factory: callable factory, which transforms a sequence ID into
346 a L{csb.bio.io.wwpdb.EntryID} object. By default
347 this is L{csb.bio.io.wwpdb.EntryID.create}.
348 @type id_factory: callable
349 @param strict: if True, raise exception on duplicate sequence identifiers.
350 See L{csb.bio.sequence.AbstractAlignment} for details
351 @type strict: bool
352 """
353
355
356 from csb.bio.io.wwpdb import EntryID
357
358 if id_factory is None:
359 id_factory = EntryID.create
360 if not hasattr(id_factory, '__call__'):
361 raise TypeError(id_factory)
362
363 if not hasattr(provider, 'get'):
364 raise TypeError(provider)
365
366 self._type = SequenceTypes.Protein
367 self._strict = bool(strict)
368 self._provider = provider
369 self._id_factory = id_factory
370
371 @property
373 """
374 Default sequence type of the alignment rows - a member of L{SequenceTypes}
375 @rtype: enum item
376 """
377 return self._type
378
379 @property
381 """
382 Current L{csb.bio.io.wwpdb.StructureProvider} instance in use
383 @rtype: L{StructureProvider}
384 """
385 return self._provider
386
387 @property
389 """
390 Current L{csb.bio.io.wwpdb.EntryID} factory instance in use
391 @rtype: L{EntryID}
392 """
393 return self._id_factory
394
395 @property
402
404 """
405 Build a protein structure alignment given a multi-FASTA string
406 and the current structure C{provider}.
407
408 @param string: alignment string
409 @type string: str
410
411 @rtype: L{SequenceAlignment}
412
413 @raise SequenceError: when an aligned sequence is not a proper
414 subsequence of its respective source PDB chain
415 @raise StructureNotFoundError: if C{provider} can't provide a structure
416 for a given sequence ID
417 @raise InvalidEntryIDError: if a given sequence ID cannot be parsed
418 """
419
420 entries = []
421 parser = SequenceParser(Sequence, self.product_type)
422 sequences = parser.parse_string(string)
423
424 for row in sequences:
425 id = self.id_factory(row.id)
426 chain = self.provider.get(id.accession).chains[id.chain]
427
428 entry = self.make_entry(row, chain)
429 entries.append(entry)
430
431 return StructureAlignment(entries, strict=self.strict)
432
434 """
435 Build a protein structure alignment entry, given a sequence alignment
436 entry and its corresponding source PDB chain.
437
438 @param row: sequence alignment entry (sequence with gaps)
439 @type row: L{AbstractSequence}, L{SequenceAdapter}
440 @param chain: source PDB chain
441 @type chain: L{csb.bio.structure.Chain}
442
443 @return: gapped chain sequence, containing cloned residues from the
444 source chain (except for the gaps)
445 @rtype: L{ChainSequence}
446 @raise SequenceError: when C{row} is not a proper subsequence of C{chain}
447 """
448 offset = 1
449 residues = []
450 sequence = row.strip().sequence.upper()
451
452 try:
453 start = chain.sequence.index(sequence) + 1
454 except ValueError:
455 raise SequenceError('{0}: not a subsequence of {1}'.format(row.id, chain.entry_id))
456
457 for rinfo in row.residues:
458
459 if rinfo.type == row.alphabet.GAP:
460 residues.append(rinfo)
461 continue
462 else:
463 rank = start + offset - 1
464 assert chain.residues[rank].type == rinfo.type
465 residues.append(chain.residues[rank].clone())
466 offset += 1
467 continue
468
469 return ChainSequence(row.id, row.header, residues, chain.type)
470
473 """
474 Base sequence/alignment string format builder.
475
476 @param output: destination stream, where the product is written.
477 @type output: file
478 @param headers: if False, omit headers
479 @type headers: bool
480
481 @note: File builders are not guaranteed to check the correctness of the
482 product. It is assumed that the client of the builder knows what
483 it is doing.
484 """
485 __metaclass__ = ABCMeta
486 _registry = {}
487
489
490 if not hasattr(output, 'write'):
491 raise TypeError(output)
492
493 self._out = output
494 self._headers = bool(headers)
495
496 @staticmethod
498 """
499 Create an output builder, which knows how to handle the specified
500 sequence/alignment C{format}. Additional arguments are passed to the
501 builder's constructor.
502
503 @param format: L{AlignmentFormats} member
504 @type format: L{EnumItem}
505 @rtype: L{OutputBuilder}
506 """
507 if format not in OutputBuilder._registry:
508 raise ValueError('Unhandled format: {0}'.format(format))
509
510 klass = OutputBuilder._registry[format]
511 return klass(*a, **k)
512
513 @staticmethod
515 """
516 Register a new output builder.
517
518 @param format: L{AlignmentFormats} member
519 @type format: L{EnumItem}
520 @param klass: builder class (L{OutputBuilder} sub-class)
521 @type klass: type
522 """
523 assert format not in OutputBuilder._registry
524 assert issubclass(klass, OutputBuilder)
525
526 OutputBuilder._registry[format] = klass
527
528 @property
535
536 @property
538 """
539 True if sequence headers will be written to the destination
540 @rtype: bool
541 """
542 return self._headers
543
549
551 """
552 Write a chunk of C{text}, followed by a newline terminator.
553 """
554 self._out.write(text)
555 self._out.write('\n')
556
557 @abstractmethod
559 """
560 Format and append a new sequence to the product.
561 @type sequence: L{AbstractSequence}
562 """
563 pass
564
566 """
567 Format and append a collection of L{AbstractSequence}s to the product.
568 @type sequences: iterable of L{AbstractSequence}s
569 """
570 for s in sequences:
571 self.add_sequence(s)
572
573 @abstractmethod
575 """
576 Format and append an alignment to the product.
577 @type alignment: L{AbstractAlignment}
578 """
579 pass
580
586
588 """
589 Append a comment line to the product.
590
591 @param text: comment text
592 @type text: str
593 @param comment: comment prefix
594 @type comment: str
595 @param length: maximal comment length
596 @type length: int
597 """
598 for i in range(0, len(text), length):
599 self.write(comment)
600 self.write(' ')
601 self.write(text[i : i + length])
602 self.write('\n')
603
605 """
606 Formats sequences as standard FASTA strings. See L{OutputBuilder}.
607 """
608 FORMAT = AlignmentFormats.FASTA
609
611
612 if self.headers:
613 self.write(AbstractSequence.DELIMITER)
614 self.writeline(sequence.header)
615
616 insertion = str(sequence.alphabet.INSERTION)
617 gap = str(sequence.alphabet.GAP)
618
619 self.writeline(sequence.sequence.replace(insertion, gap))
620
624
626 """
627 Formats sequences as A3M strings. When appending an alignment, this builder
628 will write all insertion-containing columns in lower case. Also, gap symbols
629 are omitted if the respective columns contain insertions.
630
631 See L{OutputBuilder}.
632 """
633 FORMAT = AlignmentFormats.A3M
634
636
637 if self.headers:
638 self.write(AbstractSequence.DELIMITER)
639 self.writeline(sequence.header)
640
641 self.writeline(sequence.sequence)
642
644
645 if isinstance(alignment, A3MAlignment):
646 self._add_a3m(alignment)
647 else:
648 self._add_proper(alignment)
649
651
652 for s in alignment.rows:
653
654 if self.headers:
655 self.write(AbstractSequence.DELIMITER)
656 self.writeline(s.header)
657
658 sequence = []
659
660 for ci in s.columns:
661
662 if ci.residue.type != s.alphabet.INSERTION:
663 char = str(ci.residue.type)
664
665 if alignment.insertion_at(ci.column):
666 sequence.append(char.lower())
667 else:
668 sequence.append(char)
669 else:
670 continue
671
672 self.writeline(''.join(sequence))
673
675
676 for s in alignment.rows:
677
678 if self.headers:
679 self.write(AbstractSequence.DELIMITER)
680 self.writeline(s.header)
681
682 master = alignment.rows[1]
683 sequence = []
684
685 for ci in s.columns:
686 char = str(ci.residue.type)
687
688 if master.columns[ci.column].residue.type == master.alphabet.GAP:
689 if ci.residue.type == s.alphabet.GAP:
690 continue
691 else:
692 sequence.append(char.lower())
693 else:
694 sequence.append(char)
695
696 self.writeline(''.join(sequence))
697
699 """
700 Formats sequences as PIR FASTA strings, recognized by Modeller.
701 See L{OutputBuilder} for general alignment documentation.
702 """
703 FORMAT = AlignmentFormats.PIR
704
706 self._add(sequence, template=True)
707
709
710 for n, sequence in enumerate(alignment.rows, start=1):
711 if n == 1:
712 self.add_target(sequence)
713 else:
714 self.add_template(sequence)
715
716
718 self._add(sequence, template=False)
719
721 self._add(sequence, template=True)
722
724
725 if self.headers:
726
727 if template:
728 type = 'structure'
729 else:
730 type = 'sequence'
731
732 id = sequence.id
733 start = end = '.'
734
735 if hasattr(sequence, 'start') and hasattr(sequence, 'end'):
736 start = sequence.start
737 end = sequence.end
738
739 header = 'P1;{0}\n{2}:{0}:{3}:{1}:{4}:{1}::::'.format(id[:-1], id[-1], type, start, end)
740
741 self.write(AbstractSequence.DELIMITER)
742 self.writeline(header)
743
744 insertion = str(sequence.alphabet.INSERTION)
745 gap = str(sequence.alphabet.GAP)
746
747 self.write(sequence.sequence.replace(insertion, gap))
748 self.writeline('*')
749
750
751 # register builders
752 for klass in OutputBuilder.__subclasses__():
753 OutputBuilder.register(klass.FORMAT, klass)
754
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Fri Jul 5 14:24:56 2013 | http://epydoc.sourceforge.net |