-
Notifications
You must be signed in to change notification settings - Fork 12
/
cPecanAlign.c
164 lines (147 loc) · 7.17 KB
/
cPecanAlign.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#include "sonLib.h"
#include "pairwiseAligner.h"
#include "multipleAligner.h"
#include "commonC.h"
static void usage(char *argv[]) {
fprintf(stderr, "%s fasta_target fasta_query\n", argv[0]);
}
// Returns a hash mapping from sequence header to sequence data.
static stHash *readFastaFile(char *filename) {
FILE *fasta = fopen(filename, "r");
if (fasta == NULL) {
st_errnoAbort("Could not open fasta file %s", filename);
}
stHash *headerToData = stHash_construct3(stHash_stringKey,
stHash_stringEqualKey,
free,
free);
struct List *seqs = constructEmptyList(0, NULL);
struct List *seqLengths = constructEmptyList(0, free);
struct List *headers = constructEmptyList(0, free);
fastaRead(fasta, seqs, seqLengths, headers);
for (int64_t i = 0; i < seqs->length; i++) {
char *fullHeader = headers->list[i];
stList *headerTokens = stString_splitByString(fullHeader, " ");
char *usableHeader = stString_copy(stList_get(headerTokens, 0));
stHash_insert(headerToData, usableHeader, seqs->list[i]);
stList_destruct(headerTokens);
}
destructList(seqs);
destructList(seqLengths);
destructList(headers);
return headerToData;
}
// copied from cPecanRealign, which is sloppy.
void *convertToAnchorPair(void *aPair, void *extraArg) {
stIntTuple *i = stIntTuple_construct3(stIntTuple_get(aPair, 1), stIntTuple_get(aPair, 2), 0);
stIntTuple_destruct(aPair);
return i;
}
// copied from cPecanRealign
struct PairwiseAlignment *convertAlignedPairsToPairwiseAlignment(char *seqName1, char *seqName2, double score,
int64_t length1, int64_t length2, stList *alignedPairs) {
//Make pairwise alignment
int64_t pX = -1, pY = -1, mL = 0;
//Create an end matched pair, which is used to ensure the alignment has the correct end indels.
struct List *opList = constructEmptyList(0, (void (*)(void *)) destructAlignmentOperation);
stList_append(alignedPairs, stIntTuple_construct3(length1, length2, 0));
for (int64_t i = 0; i < stList_length(alignedPairs); i++) {
stIntTuple *alignedPair = stList_get(alignedPairs, i);
int64_t x = stIntTuple_get(alignedPair, 0);
int64_t y = stIntTuple_get(alignedPair, 1);
assert(x - pX > 0);
assert(y - pY > 0);
if (x - pX > 0 && y - pY > 0) { //This is a hack for filtering
if (x - pX > 1) { //There is an indel.
if (mL > 0) {
listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL, 0));
mL = 0;
}
listAppend(opList, constructAlignmentOperation(PAIRWISE_INDEL_X, x - pX - 1, 0));
}
if (y - pY > 1) {
if (mL > 0) {
listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL, 0));
mL = 0;
}
listAppend(opList, constructAlignmentOperation(PAIRWISE_INDEL_Y, y - pY - 1, 0));
}
mL++;
pX = x;
pY = y;
}
}
//Deal with a trailing match, but exclude the final match
if (mL > 1) {
listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL - 1, 0));
}
stIntTuple_destruct(stList_pop(alignedPairs));
//Construct the alignment
struct PairwiseAlignment *pA = constructPairwiseAlignment(seqName1, 0, length1, 1, seqName2, 0, length2, 1, score,
opList);
return pA;
}
int main(int argc, char *argv[]) {
// Parse arguments
if (argc != 3) {
usage(argv);
return 1;
}
// You would load a custom HMM here if you wanted using
// hmm_getStateMachine (see the realign code)
StateMachine *stateMachine = stateMachine5_construct(fiveState);
PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct();
stHash *targetSequences = readFastaFile(argv[1]);
stHash *querySequences = readFastaFile(argv[2]);
// For each query sequence, align it against all target sequences.
stHashIterator *queryIt = stHash_getIterator(querySequences);
char *queryHeader;
while ((queryHeader = stHash_getNext(queryIt)) != NULL) {
char *querySeq = stHash_search(querySequences, queryHeader);
stHashIterator *targetIt = stHash_getIterator(targetSequences);
char *targetHeader;
while ((targetHeader = stHash_getNext(targetIt)) != NULL) {
char *targetSeq = stHash_search(targetSequences, targetHeader);
// Here we should try both the target sequence and its
// reverse-complemented version
// Aligns the sequences.
// If you have alignment constraints (anchors) you should
// replace this with getAlignedPairsUsingAnchors.
stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq,
querySeq, parameters,
true, true);
// Takes into account the probability of aligning to a
// gap, by transforming the posterior probability into the
// AMAP objective function (see Schwartz & Pachter, 2007).
alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq),
strlen(querySeq),
parameters->gapGamma);
// I think this calculates the optimal ordered set of
// alignments from the unordered set of aligned pairs, not
// completely sure.
alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs,
targetSeq,
querySeq,
// This parameter says that the minimum posterior probability we will accept has to be at least 0.9.
0.9);
// After this the "aligned pairs" data structure changes,
// which is a little sketchy. It's just so that the
// alignment can be printed properly.
stList_mapReplace(alignedPairs, convertToAnchorPair, NULL);
stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn);
struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader,
0, strlen(targetSeq), strlen(querySeq), alignedPairs);
// Output the cigar string
cigarWrite(stdout, alignment, 0);
stList_destruct(alignedPairs);
destructPairwiseAlignment(alignment);
}
stHash_destructIterator(targetIt);
}
stHash_destructIterator(queryIt);
// Clean up
stHash_destruct(targetSequences);
stHash_destruct(querySequences);
pairwiseAlignmentBandingParameters_destruct(parameters);
stateMachine_destruct(stateMachine);
}