| Trees | Indices | Help |
|---|
|
|
1 """Martel expression for the hmmpfam database search program in hmmer.
2
3 This has been tested with version 2.2g. I should also make it work with
4 2.1.1 output.
5
6 XXX This isn't completely finished and doesn't do everything quite right.
7 The main two problems being that it isn't well tested for a wide variety of
8 outputs and that the family line is not parsed into it's respective parts
9 (see multitude of comments on this below).
10 """
11
12 import warnings
13 warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning)
14
15
16
17 from Martel import *
18 from Martel import RecordReader
19 from Bio import Std
20
21 # -- header
22 # hmmpfam - search one or more sequences against HMM database
23 program_description = (Std.application_name(Str("hmmpfam")) +
24 ToEol())
25
26 # HMMER 2.2g (August 2001)
27 program_version = (Str("HMMER ") +
28 Std.application_version(Re(r"\d\.\d\w") |
29 Re(r"\d\.\d\.\d")) +
30 ToEol())
31
32 # Copyright (C) 1992-2001 HHMI/Washington University School of Medicine
33 # Freely distributed under the GNU General Public License (GPL)
34
35 copyright = (ToEol() +
36 ToEol())
37
38 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
39 # HMM file: /data/hmm/Pfam_fs
40 # Sequence file: hmmpfam_test.fasta
41 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
42
43 files = (ToEol() +
44 Str("HMM file:") + Spaces() +
45 Std.database_name(UntilEol()) + AnyEol() +
46 Str("Sequence file:") + Spaces() +
47 Group("inputfile_name", UntilEol()) + AnyEol() +
48 ToEol())
49
50 header = Std.search_header(program_description + program_version +
51 copyright + files)
52
53 # -- record
54
55 #
56 # Query sequence: AT1G01040
57 # Accession: [none]
58 # Description: Test Sequence for hmmpfam
59
60 sequence_info = (ToEol() + # blank line
61 Str("Query sequence:") + Spaces() +
62 Group("query_name", UntilEol()) + AnyEol() +
63 Str("Accession:") + Spaces() +
64 Group("query_accession", UntilEol()) + AnyEol() +
65 Str("Description:") + Spaces() +
66 Std.query_description(UntilEol()) + AnyEol())
67
68 # generic name for models
69 # most model names are something like PFwhatever, but we also have some
70 # different kinds like AAA and Sigma54_activat
71 model_name = Re(r"[\w-]+")
72
73 #
74 # Scores for sequence family classification (score includes all domains):
75 # Model Description Score E-value N
76 # -------- ----------- ----- ------- ---
77
78 family_header = (ToEol() + # blank line
79 Str("Scores") + ToEol() +
80 Str("Model") + ToEol() +
81 Str("-----") + ToEol())
82
83 # family lines can either be a hit:
84 # PF00636 RNase3 domain 354.9 1.1e-118 2
85 # or no hit:
86 # [no hits above thresholds]
87
88 no_hit_line = (Spaces() + Str("[no hits above thresholds]") + AnyEol())
89
90 family_hit_line = (Group("family_model", model_name) + Spaces() +
91 # XXX This is a pain in the ass, so I gave up
92 # We need to match descriptions but stop when we get to the
93 # score value.
94 # This is very difficult since the description can contain
95 # letters, numbers, (, ), /, - and periods. The numbers and
96 # periods leave me unable to think of a way to distinguish
97 # a "description word" from a score word.
98 # You also can't use spaces as descriptions can have 2
99 # spaces in them (not just one separating every word) and
100 # the description to score value can also be separated by
101 # only 2 spaces.
102 # I can't for the life of me figure this so I just
103 # eat the rest of the line for right now and downstream
104 # apps will have to get the information out from there.
105 ToEol("family_information"))
106 #Group("family_description",
107 # Rep1(Re(r"[a-zA-Z0-9_()/-]+") +
108 # Alt(Str(" "), Str(" ")))) +
109 # Alt(Spaces()) +
110 # Float("family_score") + Spaces() +
111 # Float("family_evalue") + Spaces() +
112 # Integer("family_n") + AnyEol())
113
114 #
115 # Parsed for domains:
116 # Model Domain seq-f seq-t hmm-f hmm-t score E-value
117 # -------- ------- ----- ----- ----- ----- ----- -------
118 domain_header = (ToEol() + # blank line
119 Str("Parsed for domains:") + AnyEol() +
120 Str("Model") + ToEol() +
121 Str("-----") + ToEol())
122
123 # domain lines can also be either a hit:
124 # PF00270 1/1 239 382 .. 1 141 [. 30.3 2.2e-09
125 # or not hit:
126 # [no hits above thresholds]
127
128 # I have no idea what these things exactly mean, but parse 'em anyways
129 symbol_forward = Str(".") | Str("[")
130 symbol_reverse = Str(".") | Str("]")
131 match_symbols = symbol_forward + symbol_reverse
132
133 domain_hit_line = (Group("domain_model", model_name) + Spaces() +
134 Group("domain_domain", Integer() + Str("/") + Integer()) +
135 Spaces() +
136 Integer("domain_seq-f") + Spaces() +
137 Integer("domain_seq-r") + Spaces() +
138 Group("domain_seq_symbols", match_symbols) + Spaces() +
139 Integer("domain_hmm-f") + Spaces() +
140 Integer("domain_hmm-t") + Spaces() +
141 Group("domain_hmm_symbols", match_symbols) + Spaces() +
142 Float("domain_score") + Spaces() +
143 Float("domain_evalue") + AnyEol())
144
145 #
146 # Alignments of top-scoring domains:
147 alignment_header = (ToEol() + # empty line
148 Str("Alignments of top-scoring domains:") +
149 AnyEol())
150
151 # PF00271: domain 1 of 1, from 686 to 767: score 58.6, E = 8.9e-17
152 # *->ell.epgikvarlhG.....glsqeeReeilekFrngkskvLvaTdv
153 # el ++ i++a++ G+++++++++++ ++++ kFr+g + +LvaT+v
154 # AT1G01040 686 ELPsLSFIRCASMIGhnnsqEMKSSQMQDTISKFRDGHVTLLVATSV 732
155 #
156 # aarGlDipdvnlVInydlpwnpesyiQRiGRaGRaG<-*
157 # a++GlDi ++n+V +dl +++ yiQ +GR +R
158 # AT1G01040 733 AEEGLDIRQCNVVMRFDLAKTVLAYIQSRGR-ARKP 767
159 #
160 domain_align_header = (Group("dalign_name", model_name) +
161 Str(": domain ") +
162 Integer("dalign_of_domain") +
163 Str(" of ") +
164 Integer("dalign_total_domain") +
165 Str(", from ") +
166 Integer("dalign_domain_start") +
167 Str(" to ") +
168 Integer("dalign_domain_end") +
169 Str(": score ") +
170 Float("dalign_score") +
171 Str(", E = ") +
172 Float("dalign_evalue") +
173 AnyEol())
174
175 # occasionally we have a line like:
176 # RF xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
177 # preceeding the alignment. I'm not sure what this means.
178 rf_line = (Spaces() + Str("RF ") + ToEol())
179
180 domain_align_top = (Spaces() +
181 UntilEol("dalign_match_top") + AnyEol())
182
183 domain_align_middle = (Spaces() +
184 UntilEol("dalign_match_middle") + AnyEol())
185
186 domain_align_bottom = (Spaces() +
187 ToSep("dalign_query_name", " ") + Spaces() +
188 # some match ends can have:
189 # AT1G01230 - -
190 # instead of an actual line, so there are fixes
191 # in here for that messed up case
192 Alt(Integer("dalign_query_start") + Spaces() +
193 Group("dalign_match_bottom",
194 Re("[\w\-]+")) + Spaces() +
195 Integer("dalign_query_end") +
196 Spaces() + AnyEol(),
197 Str("- ") + ToEol()) +
198 ToEol()) # blank line
199
200 domain_alignment = (domain_align_header +
201 Rep1(Opt(rf_line) +
202 domain_align_top +
203 domain_align_middle +
204 domain_align_bottom))
205
206 # //
207 record_end = Str("//") + AnyEol()
208
209
210 record = Std.record(Rep(sequence_info +
211 family_header +
212 (no_hit_line | Rep1(family_hit_line)) +
213 domain_header +
214 (no_hit_line | Rep1(domain_hit_line)) +
215 alignment_header +
216 (no_hit_line | Rep1(domain_alignment)) +
217 record_end
218 ))
219
220 format = HeaderFooter("hmmpfam", {},
221 header, RecordReader.CountLines, (8,),
222 record, RecordReader.EndsWith, ("//\n",),
223 None, None, None)
224
| Trees | Indices | Help |
|---|
| Generated by Epydoc 3.0.1 on Mon Sep 15 09:24:44 2008 | http://epydoc.sourceforge.net |