%mMutualCounter_IF.m   %produces a covariation list of any entry length

%(_IF for Isostericity Filter)
%Takes most time when the PositionList is big

% Disable every warning for the remainder of the MATLAB session.
warning off all
% if ispc, sep='\'; else, sep='/';end

% Create the "Output" directory next to the current working directory.
mkdir(strcat('..',filesep,'Output'));

% Build the absolute path of that directory: the current directory up to and
% including its last file separator, followed by 'Output' and a separator.
OrigDir=cd;
ind= strfind(OrigDir,filesep);   % strfind replaces the deprecated findstr (note the swapped argument order)
OutputPath=strcat(OrigDir(1:ind(end)),'Output',filesep);


% Label of the current pass; it is used below when composing the name of the
% covariation output file. The counts pass is the default label.
PercentCharacter = 'Counts';
if Percentages==1
    PercentCharacter = 'Percents';
else
    % Counts pass only: read the text file containing the isosteric subfamily
    % weights that are used with the scores further down.
    mReadSCOREweights
end

% Store the number of sequences of every chosen domain and raise
% NumberOfDomainsError when one of them is empty, i.e. when the user asked for
% more domains than the sequence alignment provides.
% NOTE(review): the flag is only ever set here, never cleared - confirm that
% the caller clears it between independent runs.
for P=ChosenDomains
    NSeq_all(P) = DomainLimits(P+1)-DomainLimits(P);
    if NSeq_all(P)==0
        NumberOfDomainsError=1;
        % The analysis runs once for counts and once for percentages
        % (Percentages=0 and 1); report the missing domain in the counts pass only.
        if Percentages==0, fprintf('%s\t%s\n',DomainNames{P},'does not exist'); end
    end
end


if min(FoundOrganism)==1
    if ~exist('NumberOfDomainsError','var')
        %tic
        %for xlsfile = 1:length(PositionList)
        %InputFile = strcat(pathname,PositionList);
        %mGetPositions
        
        % Pick the alphabet of the base-pair table. ShowNs adds the 'o' symbol,
        % ShowGaps adds the '-' (gap) symbol. Any other flag value leaves Bases
        % untouched.
        if ShowNs == 1
            if ShowGaps == 1
                Bases   =   {'A' 'C' 'G' 'U' 'o' '-'};
            elseif ShowGaps == 0
                Bases   =   {'A' 'C' 'G' 'U' 'o'};
            end
        elseif ShowNs == 0
            if ShowGaps == 0
                Bases   =   {'A' 'C' 'G' 'U'};
            elseif ShowGaps == 1
                Bases   =   {'A' 'C' 'G' 'U' '-'};
                % Gaps are the 5th and not the 6th entry in this alphabet, so
                % the corresponding values coming from mIsostericity are
                % overwritten in M.
                M(:,5,:)=13;
                M(5,:,:)=13;
            end
        end
        
        if length(InputData(1,:)) == 2 %In the case when we have a basepair list (Only two columns of numbers in the input .xls file)

            % Paths of the "_Forbidden.out" and "_Gaps.out" report files inside OutputPath.
            % The code in this file that opened and wrote them is commented out.
            % NOTE(review): they may still be read by one of the called scripts - confirm before removing.
            OUTforbidden    = strcat(OutputPath,PositionList,'_',FastaFilename,'_Forbidden.out');
            OUTgaps         = strcat(OutputPath,PositionList,'_',FastaFilename,'_Gaps.out');
            %         OUTallowed      = strcat(OutputPath,PositionList,'_',FastaFilename,'_Allowed.out');

% % % % %             fidOUTforbidden =   fopen(OUTforbidden,'w+');
% % % % %             fidOUTgaps      =   fopen(OUTgaps,'w+');
% % % % %             fidOUTallowed   =   0;%fopen(OUTallowed,'w+');
% % % % % 
% % % % %             fprintf(fidOUTforbidden,'%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s\n','UN1','UN1','LN1','LN2','Inter','Domain','Forbidden Count','Sequences Names');
% % % % %             fprintf(fidOUTgaps,'%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s\n','UN1','UN1','LN1','LN2','Inter','Domain','Gap Count','Sequences Names');
% % % % %             fprintf(fidOUTallowed,'%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s\n','UN1','UN1','LN1','LN2','Inter','Domain','Allowed Count','Isosteric','Sequences Names');

            %Matches(size(Sequences),length(Bases),length(Bases),length(InputData(:,1)),length(ChosenDomains))      =   0;

%             Bases   =   {'A' 'C' 'G' 'U' 'o' '-'};
%             BasesTot=   {'A' 'C' 'G' 'U' 'o' '-' 'Tot'};
            % Build the base-pair label matrix: row i / column j holds the
            % concatenation Bases{i}Bases{j}, i.e. AA, AC, AG, AU, ... on the
            % first row, then CA, CC, CG, ... on the second row.
            % The matrix is allocated from scratch. Assigning only its last
            % element would keep the size and contents of a BPs variable left in
            % the workspace by an earlier run with a longer Bases list, and
            % length(BPs) is used as loop bound throughout the analysis.
            BPs = cell(length(Bases),length(Bases));
            for i=1:length(Bases)
                for j=1:length(Bases)
                    BPs(i,j)=strcat(Bases(i),Bases(j));
                end
            end

            % Fresh accumulators for the base-pair analysis.
            % Count and Percent are indexed (first base, second base, row of
            % InputData, domain). They are created with zeros() rather than by
            % assigning their last element: that form keeps the shape and values
            % of a Count/Percent variable left over from an earlier run (for
            % example the 2-D arrays of the covariation branch), which would then
            % leak into the sums taken over Count(:,:,n,P). It also fails when
            % one of the dimensions is 0.
            NSeq_all    =   zeros(1,length(ChosenDomains));
            NSeq        =   zeros(1,length(ChosenDomains));
            Count       =   zeros(length(BPs),length(BPs),length(InputData(:,1)),length(ChosenDomains));
            Percent     =   zeros(length(BPs),length(BPs),length(InputData(:,1)),length(ChosenDomains));
            SumTotal    =   zeros(length(InputData(:,1)),length(ChosenDomains));

            %fprintf('\n%s %s %s\n','Computing',PercentCharacter,'...');
            % State of the progress counter printed by the counting loop.
            PercentDoneLast=-1;
            FirstTime=0;
            
% SeqName{length(BPs),length(BPs),length(InputData(:,1)),length(ChosenDomains)}=' '; %No speed advantage
            % Main counting loop. For every row n of InputData and every chosen
            % domain P it counts how often each base-pair label BPs(a,b) occurs
            % among the sequences of that domain in PosArray(:,:,n), then derives
            % percentages, column/row sums and the expected value of every pair.
            for n=1:length(InputData(:,1))

                % Progress counter: the backspaces overwrite the previously printed value.
                PercentDone = round(n*100/length(InputData(:,1)));
                if PercentDone ~= PercentDoneLast, %Only print percent counter in case it changes, to save time
                    PercentDoneLast = PercentDone;
                    %if FirstTime==0,FirstTime=1; fprintf('\n\n\n\n'); end
                    if PercentDone==0,  fprintf('\n\n\n\n\n'); end
                    fprintf('\b\b\b\b\b%3g%s',   PercentDone,' %');
                    %if PercentDone<11,  fprintf('\b\b\b%g%s',   PercentDone,' %');
                    %else,               fprintf('\b\b\b\b%g%s', PercentDone,' %'); end
                end
                %if n/100 == fix(n/100),    fprintf('%g\n',n); end
                
                %PosArray(:,:,n)=sortrows(PosArray(:,:,n));
                for P=ChosenDomains
                    NSeq_all(P) = DomainLimits(P+1)-DomainLimits(P);
                    % y and z are the first and last sequence index of domain P.
                    y = DomainLimits(P)+1;
                    z = DomainLimits(P+1);
                    %if z>y
                    for a=1:length(BPs)
                        for b=1:length(BPs)
                            %clear SeqName(a,b,n,P,:);
                            %Match(a,b,n,P,:)           =   strcmp(BPs(a,b),PosArray(y:z,:,n));            %Optimized for Speed% (strcmp is much faster than strmatch)
                            Count(a,b,n,P)  =   sum(strcmp(BPs(a,b),PosArray(y:z,:,n)));                   %Optimized for Speed%
                            
% Counts pass only: remember the names of the sequences showing this pair.
if Percentages==0
                            % Amm holds indices relative to y, hence the DomainLimits(P) offset below.
                            Amm             =  find(strcmp(BPs(a,b),PosArray(y:z,:,n)));
if Count(a,b,n,P)~=0,
% Assigning the element at the largest indices grows SeqName in one step.
SeqName{length(BPs),length(BPs),length(InputData(:,1)),length(ChosenDomains),Count(a,b,n,P)}=' ';    %Speeds up process
                            for x=1:Count(a,b,n,P)%length(Amm)
                                %MatchesPos(a,b,n,P,x)      =   Amm(x);                                    %Optimized for Speed%
                                SeqName(a,b,n,P,x) = FastaOrganismNames(Amm(x) + DomainLimits(P));
                            end
end
end
                            %if Count(a,b,n,P)~=0,
                            %   SeqName(a,b,n,P,1:Count(a,b,n,P))  =   FastaOrganismNames(MatchesPos(a,b,n,P,:)+DomainLimits(P));
                            %end
                        end
                    end
                    %end
                    % &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&This is the sum of GOOD sequences
                    % NSeq(P) is the number of sequences matched by any pair of the table, not the domain size.
                    NSeq(P)=sum(sum(Count(:,:,n,P)));%%%VERY IMPORTANT MODIFICATION, WILL CAUSE TROUBLE WHEN WE START DISPLAYING ONLY CERTAIN BASEPAIRS AND NOT ALL
                    % &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
                    % NOTE(review): when NSeq(P) is 0 the divisions below evaluate 0/0 and yield NaN.
                    for a=1:length(BPs)
                        for b=1:length(BPs)
                            Percent(a,b,n,P)=100*Count(a,b,n,P)/NSeq(P);%%%%%%%NSeq(P)
                            Percent(a,b,n,P)=round(Rounding*Percent(a,b,n,P))/Rounding;
                        end
                    end
                    %end

                    %for n=1:length(InputData(:,1))
                    % Marginal totals of the count table: per column, per row (via the transpose) and overall.
                    SumColumns(:,:,n,P)   =   sum(Count(:,:,n,P));
                    SumRows(:,:,n,P)      =   sum(Count(:,:,n,P)');
                    SumTotal(n,P)         =   sum(SumColumns(:,:,n,P));    %NOT always fixed to closest decimal, regardless of the value of the variable "Rounding"

                    % Percentages pass: express the marginal totals as rounded percentages of NSeq(P).
                    if Percentages== 1
                        SumColumns(:,:,n,P)   =   100*SumColumns(:,:,n,P)/NSeq(P);
                        SumColumns(:,:,n,P)   =   round(Rounding*SumColumns(:,:,n,P))/Rounding;
                        SumRows(:,:,n,P)      =   100*SumRows(:,:,n,P)/NSeq(P);
                        SumRows(:,:,n,P)      =   round(Rounding*SumRows(:,:,n,P))/Rounding;
                        SumTotal(n,P)         =   100*SumTotal(n,P)/NSeq(P);
                        SumTotal(n,P)         =   round(Rounding*SumTotal(n,P))/Rounding;
                    end

                    % Expected value of each cell: column total times row total divided by the grand total.
                    for a=1:length(BPs)
                        for b=1:length(BPs)
                            Expected(b,a,n,P) =   SumColumns(1,a,n,P)*SumRows(1,b,n,P)/SumTotal(n,P);
                            Expected(b,a,n,P) =   round(Rounding*Expected(b,a,n,P))/Rounding;
                        end
                    end
                end
            end

            
            if Percentages==0
                % Per-interaction accumulators, one row per row of InputData.
                % The second dimension is fixed at 3 instead of
                % length(ChosenDomains): what if only A and E are chosen,
                % without B? (The order is A,B,E)
                Isosteric               =   ones(size(InputData,1),3);

                IsosterCompar           =   zeros(size(InputData,1),3);
                ForbiddenCount          =   zeros(size(InputData,1),3);
                GapCount                =   zeros(size(InputData,1),3);
                AllowedCount            =   zeros(size(InputData,1),3);

                IsostericANDForbidden   =   zeros(size(InputData,1),3);
                IsostericANDAllowed     =   zeros(size(InputData,1),3);
                IsostericANDGaps        =   zeros(size(InputData,1),3);

                IsostericCount          =   zeros(size(InputData,1),3);
                NearIsostericCount      =   zeros(size(InputData,1),3);
                HeterostericCount       =   zeros(size(InputData,1),3);

                % Per-domain summary values.
                ForbiddenPerc           =   zeros(1,3);
                ForbiddenCountPerc      =   zeros(1,3);
                GapsPerc                =   zeros(1,3);
                GapsCountPerc           =   zeros(1,3);
                AllowedPerc             =   zeros(1,3);
                AllowedCountPerc        =   zeros(1,3);

                % X and Y define the IsosterCompar cell.
                X = zeros(size(InputData,1),3);
                Y = zeros(size(InputData,1),3);

                for P=ChosenDomains
                    for n=1:size(InputData,1)
                        if Table(n)==14
                            % 14 means that no name (such as cWW or tHS) was given
                            % to interaction n in the excel Positions list read by
                            % mGetPositions.m, so there is nothing to filter.
                            Isosteric(n,P)   =   0;
                            Allowed(n,P)     =   0;
                        else
                            mIsostericityFilter_Printer
                            % x and y are taken as left in the workspace after the script call above.
                            X(n,P) = x;
                            Y(n,P) = y;
                        end
                    end
                end
            end
            
            % GapGapCount(n,P): number of sequences of domain P in which both
            % positions of pair n are gaps, i.e. the '-','-' cell of Count.
            if any(strcmp('-',Bases))
                Index           =   find(strcmp('-',Bases));

                for n=1:size(InputData,1)
                    for P=ChosenDomains
                        GapGapCount(n,P)    =   Count(Index,Index,n,P);
                    end
                end

            else
                % Gaps are not part of the alphabet: nothing to count.
                % (GapCount itself is handled as a part of the counting.)
                GapGapCount     =   zeros(size(InputData,1),3);
                if Percentages==0
                    fprintf('\n%s\n','ALERT: GAPS ARE NOT CONSIDERED, SCORES MAY BE UNRELIABLE');
                end
            end

            %wi=3; wni=2; wh=-1; wf=-2; wg1=-2; wg2=-3; wmax=max(wi,wni,wh,wf,wg1,wg2);
            % Score of every interaction n in every chosen domain P, computed
            % in three steps that keep the arithmetic order of the single
            % expression they replace.
            for n=1:size(InputData,1)
                for P=ChosenDomains
                    % 1) Weighted sum of the category counts. Gap-gap pairs
                    %    are weighted with wg2, the remaining gaps with wg1.
                    Score(n,P) = 0                                  ...
                        +   wi  *IsostericCount(n,P)                ...
                        +   wni *NearIsostericCount(n,P)            ...
                        +   wh  *HeterostericCount(n,P)             ...
                        +   wf  *ForbiddenCount(n,P)                ...
                        +   wg1 *(GapCount(n,P)-GapGapCount(n,P))   ...
                        +   wg2 *GapGapCount(n,P);

                    % 2) Apply corc, the correction coefficient set up in
                    %    mReadSCOREweights.m (usually 100), and normalize by the
                    %    number of sequences in this domain.
                    Score(n,P) = Rounding*corc*Score(n,P)/( (DomainLimits(P+1)-DomainLimits(P)) );

                    % 3) Round to the precision selected by Rounding.
                    Score(n,P) = round(Score(n,P))/Rounding;

                    % A disabled alternative took log() of the normalized
                    % weighted sum (without corc and rounding), useful to
                    % infer probabilities.
                end
            end

            % Total score of each DOMAIN: rounded sum of its interaction scores
            % divided by the number of rows of InputData.
            for P=ChosenDomains
                TotScore(P) = round(Rounding*sum(Score(:,P))/size(InputData,1))/Rounding;
            end
            
%             if Percentages==0              %So you do this only once, at the beginning
%                 if PlotScores ==1
%                     if any(Table~=14)
%                         mPlotBPScores
%                     end
%                 end
%             end
            
            %     mMutualCounterBPPrinter_HTML
            %     if length(ChosenDomains)>1
            %         mMutualCounterBPPrinter_HTML_COMB
            %     end
            % Run the output scripts of the base-pair analysis.
            mMutualCounterBPPrinter_HTML_1table
            if Percentages==0              %So you do this only once, at the beginning
                mMutualCounterBPPrinter_HTML_1table_Sequences
            else
                %mColoredSequenceTableHTML    %So you do this only once, at the end
            end
            %mMutualCounterBPPrinter_Lines
            if GUspecialYES
                mMutualCounterBPPrinter_Special2GU
            end
            if Percentages==1 %Just report this once at the end
                % Print the summary when at least one interaction has a name
                % (Table value other than 14). The test used to be
                % Table(n)~=14, which looked only at whatever row the leftover
                % loop index n pointed to after the scripts above had run.
                if any(Table~=14)
                    %mIsostericityFilter_SummaryPrinter  %%%%%%%%%%
                    mIsostericityFilter_SummaryPrinter_10percent  %%%%%%%%%%
                end
            end
% % % % %             fclose(fidOUTforbidden);
% % % % %             fclose(fidOUTgaps);
            %         fclose(fidOUTallowed);
            % Percentages pass only: write the coloured alignment HTML file and,
            % when it was quick to create, open it.
            if Percentages==1
                if SeqViewerYES
                    % At least one row must include real structural data, i.e. a
                    % Table value other than 14. any() tests every row directly;
                    % comparing sum(Table) with 14*length(Table) could be
                    % satisfied by a mix of other values.
                    if any(Table~=14)
                        tic
%                         try
%                             run('Alignment_Viewer\mColorAlignment13_WriteHTML');                   noOutputColAlign=0;
                            mColorAlignment13_WriteHTML;                             noOutputColAlign=0;
%                         catch, fprintf('\n%s\n','mColorAlignment13_WriteHTML could not be run');    noOutputColAlign=1;
%                         end
                        if noOutputColAlign~=1
                            if toc < 30 %Open with Explorer only if file is small (if takes a short time to create), otherwise better open with Mozilla Firefox
                                % Call the opener directly rather than through
                                % eval on a string-built command, which breaks
                                % when the path contains a single quote.
                                % NOTE(review): OutputColAlign is not assigned in this file - presumably by mColorAlignment13_WriteHTML; confirm.
                                if ispc ==1 %ispc returns 1 if this is run on a PC (Windows)
                                    winopen(OutputColAlign)
                                else
                                    open(OutputColAlign)
                                end
                            else
                                fprintf('\n%s\n','"_AlignView.html" file too big, it is advisable to open it with Mozilla Firefox');
                            end
                        end
                    else
                        fprintf('\n%s\n','mColorAlignment13_WriteHTML is impossible to run because BP list contains no structural data');
                    end
                end
            end

        else%%%%%%%%%%%%%%%%%%%In the case when we do not have a basepair list%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

            % Open the covariation output file of this pass. PrintArrayM is the
            % list of file identifiers written to below; it stays empty when the
            % file cannot be opened, in which case the message kept in ErrorOUTM
            % is reported after the analysis.
            OutputFileM      = strcat(OutputPath,PositionList,'_',FastaFilename,'_COV_',PercentCharacter,'.txt');
            fidOUTM = fopen(OutputFileM,'w+');
            if fidOUTM ~= -1
                PrintArrayM = fidOUTM;
            else
                PrintArrayM = [];
                ErrorOUTM = ['The file: ',OutputFileM,' could not be written to. It may be open. Please close it and try again'];
            end

            % File header: names of the two input files.
            for f=1:length(PrintArrayM)
                fprintf(PrintArrayM(f),'\n\n%s\t\t\t%s\n','Fasta Alignment File',FastaFilename);
                fprintf(PrintArrayM(f),'%s\t\t\t%s\n','Positions Excel File',PositionList);
            end
            % Covariation listing. For every row n of InputData and every chosen
            % domain P, the letter combinations of that domain are sorted and each
            % distinct combination is printed once with its count or percentage.
            for n=1:length(InputData(:,1))
                % Progress report on the console every 100 rows.
                if n/100 == fix(n/100),    fprintf('%g\n',n); end
                % Entry header: row number, organism name and the nucleotide numbers of this row.
                for f=1:length(PrintArrayM),
                    fprintf(PrintArrayM(f),'\n%g\n',n);
                    fprintf(PrintArrayM(f),'%s\t%s\n','Organism:',OrganismName{n});
                    fprintf(PrintArrayM(f),'%s','Nucleotides:');
                    for h=1:length(InputData(n,:))
                        fprintf(PrintArrayM(f),'\t%g',InputData(n,h));
                    end
                    fprintf(PrintArrayM(f),'\n');
                end
                for P=ChosenDomains
                    NSeq_all(P) = DomainLimits(P+1)-DomainLimits(P);
                    for f=1:length(PrintArrayM)
                        fprintf(PrintArrayM(f),'%s',DomainNames{P});
                        fprintf(PrintArrayM(f),'\t%s %g%s\n','(Number of sequences =',NSeq_all(P),')');
                    end
                    %PosArray(:,:,n)=sortrows(PosArray(y:z,:,n)); %NOOO if you sort here you mix up the domains in DomainLimits
                    y = DomainLimits(P)+1; %first sequence in the current DomainLimits (or domain, such as Archaea)
                    z = DomainLimits(P+1); %Last sequence

                    % NOTE(review): this sorts PosArray in place, so within the domain its rows no longer follow the original sequence order.
                    PosArray(y:z,:,n)=sortrows(PosArray(y:z,:,n)); %This is very important, this will allow counts to be printed out in alphabetical order and only once for each combination of letters with hits

                    % GUWC WCUG WCWC NTNT Gaps  %This is the special output required
% % %                     Count_SumTotal(n,P)=0;   Count_GUNT(n,P)=0;   Count_NTUG(n,P)=0;   Count_GUWC(n,P)=0;   Count_WCUG(n,P)=0;   Count_WCWC(n,P)=0;   Count_Gaps(n,P)=0;   Count_NTNT(n,P)=0;
% % %                     Percent_SumTotal(n,P)=0; Percent_GUNT(n,P)=0; Percent_NTUG(n,P)=0; Percent_GUWC(n,P)=0; Percent_WCUG(n,P)=0; Percent_WCWC(n,P)=0; Percent_Gaps(n,P)=0; Percent_NTNT(n,P)=0;

                    % Walk through the sorted rows, handling each run of identical combinations once.
                    while y <= z
                        %for y=1:size(Sequences)      %size of Sequences gives both height and width of the array "Sequences", the FIRST being height (number of sequences) is used here
                        %eval(['Count.yn',PosArray(y,:,n),'=length(strmatch(PosArray(y,:,n),PosArray(:,:,n)));']);
                        Count(y,n)=length(strmatch(PosArray(y,:,n),PosArray(y:z,:,n)));%PosArray(y,:,n) is the first combination of letters seen. PosArray(y:z,:,n) is all letters in the required DomainLimits starting from y which is the current combination of letters. This will give the Count of occurrences of the combination of letters (ex. AGCC) in the current DomainLimits (ex. Archaea)

                        % Percentage relative to the full domain size, rounded to the precision selected by Rounding.
                        Percent(y,n)    =   100*Count(y,n)/NSeq_all(P);
                        Percent(y,n)    =   round(Rounding*Percent(y,n))/Rounding;

                        %                 if strcmp(PosArray(y,1,n),'-'),mem='''';else mem='';end %This is for excel to understand that lines starting with - are not functions

                        % Print the combination followed by its percentage or its count, depending on the pass.
                        if Percentages== 1
                            for f=1:length(PrintArrayM)
                                fprintf(PrintArrayM(f),'%s\t%s\t%g\n',PosArray(y,:,n),'=',Percent(y,n));
                            end
                        else
                            for f=1:length(PrintArrayM)
                                fprintf(PrintArrayM(f),'%s\t%s\t%g\n',PosArray(y,:,n),'=',Count(y,n));
                            end
                        end

% % %                         if length(InputData(1,:)) == 4,mSpecialOutput4Heart,end %1st of 2 SpecialOutput subfunctions
                        % The loop only advances when Count(y,n) is at least 1; row y is itself among the rows compared.
                        y=y+Count(y,n); %This will make the counter jump over the combinations of letters that are exactly like the previous "Counted" ones
                    end
                end
            end
% % %             if length(InputData(1,:)) == 4,mSpecialOutput4Printer,end %2nd of 2 SpecialOutput subfunctions

            % Finish the covariation output: report the error kept from the
            % failed fopen, or close the file.
            if fidOUTM == -1
                % No semicolon on purpose: this displays the stored message.
                ErrorOUTM
                if Audio == 1
                    try
                        [Ys,Fs] = wavread(['..',filesep,'Audio',filesep,'ohno']);
                        sound(Ys,Fs); %wavplay(Ys,Fs,'async')
                    catch
                        % The sound is optional: any failure to play it is ignored.
                    end
                end
            else
                status = fclose(fidOUTM); %Closing the .out file
%                 fprintf('\n%s\n%s\n','The full output data was saved in the working directory under the name:',OutputFileM);
            end
        end
        %fclose all
        % Play the "done1" sound after the percentages pass when audio is enabled.
        if Percentages==1
            if Audio == 1
                try
                    [Ys,Fs] = wavread(['..',filesep,'Audio',filesep,'done1']);
                    sound(Ys,Fs); %wavplay(Ys,Fs,'async')
                catch
                    % The sound is optional: any failure to play it is ignored.
                end
            end
        end
        %end
        %toc
    else
        % The analysis runs for both counts and percentages (Percentages=0 and
        % 1), so the message and the "ohno" sound are produced in the counts
        % pass only rather than twice.
        if Percentages==0
            fprintf('\n%s\n%s\n','You have chosen to analyze more domains than available in your Fasta file.','Please chose less domains and try again.');
            if Audio == 1
                try
                    [Ys,Fs] = wavread(['..',filesep,'Audio',filesep,'ohno']);
                    sound(Ys,Fs); %wavplay(Ys,Fs,'async')
                catch
                    % The sound is optional: any failure to play it is ignored.
                end
            end
        end
    end
else
    % The analysis runs for both counts and percentages (Percentages=0 and 1),
    % so the message and the "ohno" sound are produced in the counts pass only
    % rather than twice.
    if Percentages==0
        % One %s for the single message argument; the format used to carry a
        % second %s that had no matching argument.
        fprintf('\n%s\n','One or more of the organisms in your Excel Positions List file could not be found among your Fasta sequences');
        if Audio == 1
            try
                [Ys,Fs] = wavread(['..',filesep,'Audio',filesep,'ohno']);
                sound(Ys,Fs); %wavplay(Ys,Fs,'async')
            catch
                % The sound is optional: any failure to play it is ignored.
            end
        end
    end
end

% After the percentages pass, tell the user where the output directory is.
% This runs regardless of which branch above was taken.
if Percentages==1
    fprintf('\n%s','Check for output files in the Output directory:');
    fprintf('\n%s\n',OutputPath);
end
