%mFastaFixer.m     %By Ali Mokdad
%This program removes repeated sequences from a FASTA alignment file by choosing,
%among repeated sequences with the same name, the one that best satisfies one of the following "Criteria":
%1- The sequence that contains the most real nucleotide letters (A, G, C, and U), or
%2- The sequence that contains the fewest internal gaps and o's
%Remark: In this new program, sequences DON'T need to be sorted by name beforehand. The program itself will sort them.

% Reset only the three variables this script rebuilds on every run.
% A full "clear" is intentionally not issued here; clear the workspace by hand if you need to.
clear FastaOrganismNames FastaComment Sequences
%clc;

% Directory separator used when building paths below.
if ispc
    sep='\';
else
    sep='/';
end

% Ask the user which FASTA file to fix, unless the variable
% RUNAFTERmFastaFixerGUI exists in the workspace.
% NOTE(review): when that flag exists, PreFastaAlign and pathname are
% presumably already set by the GUI front end -- confirm against the caller.
if ~exist('RUNAFTERmFastaFixerGUI','var')
    % Pick the folder the file dialog starts in.
    if (~exist('FastaPath','var'))||(exist('FastaFilename','var')&&~ischar(FastaFilename))
        if exist(strcat('..',sep,'FASTA_alignments'),'dir')
            % Use the sibling folder ../FASTA_alignments, expressed as an
            % absolute path. fileparts/fullfile are used instead of scanning
            % for separators by hand so this behaves the same on every platform.
            OrigDir=cd;
            FastaPath=[fullfile(fileparts(OrigDir),'FASTA_alignments') sep];
        else
            FastaPath=[cd sep];
        end
        %chdir ../FASTA_alignments/
    elseif isequal(FastaPath,0)
        % FastaPath is 0 when it holds the result of a cancelled dialog.
        FastaPath=[cd sep];
    end
    [PreFastaAlign, pathname] = uigetfile({'*.fasta;'},'Pick a fasta file to fix',FastaPath);
    % uigetfile returns 0 when the user cancels; stop here with a clear
    % message rather than failing later while trying to read a file named 0.
    if isequal(PreFastaAlign,0)
        error('mFastaFixer:noFileSelected','No FASTA file was selected; nothing to fix.');
    end
    % FastaFilename,FastaPath
end

% Start the timer that the final "toc" of this script reports.
tic

% Silence the warning MATLAB raises when non-integer values are truncated
% on conversion to char.
warning('off','MATLAB:nonIntegerTruncatedInConversionToChar')

% Example inputs that were hard-coded during development:
% PreFastaAlign = ('Trial.fasta');
% PreFastaAlign = '23S_ABE_2003_UNIQUE.fasta';
% PreFastaAlign = ('16S_ABE_2003.fasta');
% PreFastaAlignIndex = 3;
% PreFastaAlignList = {'2003_23S_Archaea.fasta','2003_23S_Bacteria.fasta','2003_23S_Eukarya.fasta'};
% PreFastaAlignList = {'2003_16S_Archaea.fasta','2003_16S_Bacteria.fasta','2003_16S_Eukarya.fasta'};
% PreFastaAlign = PreFastaAlignList{PreFastaAlignIndex};

% Tell the user which file is about to be read.
statusPrefix = 'Reading original (_GapsAdded) FASTA alignment from the file';
statusSuffix = '...';
fprintf('%s %s %s\n',statusPrefix,PreFastaAlign,statusSuffix)

% tline must hold a value other than -1 before the line-by-line reader
% further down starts its while loop.
tline = 1;

% NOTE(review): DroppedSeq is not used in this part of the script;
% presumably a later script reads it -- confirm.
DroppedSeq = 1;
% Read the alignment into three workspace variables:
%   FastaComment       - cell array with one header/comment entry per sequence
%   Sequences          - char matrix, one row per sequence (rows are indexed
%                        as Sequences(k,:) by the code that follows)
%   FastaOrganismNames - cell array with a name derived from each header
% The fast path reads the whole file with mfastaread. If anything in it
% fails, the catch branch re-reads the file line by line.
% NOTE(review): the two branches derive the name differently (fast path: text
% before the SECOND space; fallback: text before the FIRST space minus up to
% two trailing digits). This is kept as in the original -- confirm it is intended.
try
%     OrigDir=cd;
%     cd ../TripleAlignmentAnalysis/
    % mfastaread is based on fastaread from the MATLAB Bioinformatics Toolbox,
    % with some additions. THIS DOES NOT WORK WITH FASTA FILES CONTAINING VERY
    % LONG SEQUENCES (>4095 for default bufsize).
    [FastaComment, Sequences] = mfastaread(strcat(pathname,PreFastaAlign));
    for i=1:length(FastaComment)
        ind=strfind(FastaComment{i},' ');
        if length(ind)>1
            % Keep everything before the second space of the comment
            FastaOrganismNames{i}=FastaComment{i}(1,1:ind(2)-1);
        else
            FastaOrganismNames{i}=FastaComment{i};
        end
    end
%     cd(OrigDir)
catch %the following code is MUCH MUCH slower, but it works for any length of sequences
    fprintf('%s\n','Reading original Fasta sequence all at once failed, now reading line by line, please be patient this might take some time...')

    % Discard anything the failed fast path may have left half-filled, so the
    % fallback result does not mix data from both readers.
    FastaComment = {};
    FastaOrganismNames = {};
    Sequences = '';

    fid = fopen(strcat(pathname,PreFastaAlign));
    if fid == -1
        error('mFastaFixer:cannotOpenFile','Could not open FASTA file %s',strcat(pathname,PreFastaAlign));
    end

    SeqCount = 0; p = 0; q = 0; %p and q track the position and length of each line within the current sequence
    try
        while true
            tline = fgetl(fid);             %fgetl returns the numeric value -1 at the end of the file
            if ~ischar(tline), break; end   %End of file. (Comparing a char vector with -1 is element-wise and evaluates as false for an empty line, which used to stop the loop at the first blank line.)
            if isempty(tline), continue; end %Blank lines carry no data; skip them

            if strncmpi(tline,'>',1)        %First (comment) line of a fasta sequence
                SeqCount = SeqCount + 1;
                if mod(SeqCount,100)==0, fprintf('%g\n',SeqCount); end %Progress report every 100 sequences
                FastaComment{SeqCount} = tline(2:length(tline));

                %The name of the sequence ends at the FIRST space of the comment
                %line. Only that first space is used: taking every space position
                %made the digit test below look at several characters at once.
                index = find(tline==' ',1);
                if isempty(index), index = length(tline)+1; end %No spaces in the fasta comment line

                %Count up to two trailing digits of the name (Escher_coli1,
                %Escher_coli12, ...) so they can be stripped. Don't go further
                %than 2, maybe the name itself is a number--risky!
                %tline(1) is always '>', so index-1 and index-2 are valid indices
                %whenever they are reached.
                nums = 0;
                if tline(index-1)>='0' && tline(index-1)<='9'
                    nums = 1;
                    if tline(index-2)>='0' && tline(index-2)<='9'
                        nums = 2;
                    end
                end

                %Name = comment text up to the first space, without the > sign and
                %without the trailing digits (especially designed for 5S)
                FastaOrganismNames{SeqCount} = tline(2:index-1-nums);
                p = 0; q = 0;               %Zero these for the sequence that just started
            else
                if SeqCount == 0
                    error('mFastaFixer:missingHeader','Sequence data found before the first ''>'' header line in %s',strcat(pathname,PreFastaAlign));
                end
                %Append this line to the row of the current sequence
                p = q + p;
                q = length(tline);
                Sequences(SeqCount,p+1:q+p) = tline;
            end
        end
    catch readErr
        fclose(fid);                        %Do not leak the file handle when reading fails
        rethrow(readErr);
    end
    fclose(fid);
end

% Sort the sequences.
% NOTE(review): mSortSequences is invoked without arguments, so it presumably
% is a script working on the workspace variables built above -- confirm.
mSortSequences

% Per-sequence statistics behind the two selection criteria (only one of the
% two should be chosen when picking among duplicates).
% Both vectors are allocated afresh: this script never clears them, so growing
% them element by element would keep stale trailing entries from an earlier
% run on a larger alignment in the same workspace.
numSeqs = size(Sequences,1);
IntGaps = zeros(1,numSeqs);
TotACGU = zeros(1,numSeqs);
for SeqCount=1:numSeqs
    IntGaps(SeqCount) = mInternalGapCounter(Sequences(SeqCount,:)); %Number of internal gaps in the sequence
    TotACGU(SeqCount) = mTotalACGUCounter(Sequences(SeqCount,:));   %Number of real nucleotides (A,C,G, and U) in the sequence
end

% NOTE(review): mFastaPrinter is also invoked without arguments; presumably it
% writes the fixed alignment using the variables above -- confirm.
mFastaPrinter
toc
