%% fig3 sup1 a
% Figure 3 - figure supplement 1a: gap duration vs. change in transition
% entropy, pooled over all birds, with correlation coefficient and a
% least-squares regression line.
%
% For every bird there is one '<bird>_gap.csv' and one '<bird>_TE.csv' file
% in dataDir. Only the first column of the gap files is used (presumably
% the pre-lesion median gap duration, judging by the variable/axis names -
% confirm against the source data description).

% single place to change when the data are moved to another machine
dataDir='D:\analysis\data_for_elife_mMAN\data\Figure3_figuresupplement1_sourcedata1';
nBirds=7;
% one marker per bird, in bird order
birdMarkers={'o','+','x','s','d','^','p'};

gapmedian_birds=cell(nBirds,1);
delTE_birds=cell(nBirds,1);
for iBird=1:nBirds
    % trailing semicolons: do not dump the whole data matrices to the console
    gapmedian_birds{iBird}=csvread(fullfile(dataDir,sprintf('bird%d_gap.csv',iBird)));
    delTE_birds{iBird}=csvread(fullfile(dataDir,sprintf('bird%d_TE.csv',iBird)));
end
%% statistics
% pool all birds: first gap column of every bird stacked on top of each other
gapPreCols=cellfun(@(g) g(:,1),gapmedian_birds,'UniformOutput',false);
gapmedian_pre=vertcat(gapPreCols{:});
delTE=vertcat(delTE_birds{:});
[correlation,pval]=corr(gapmedian_pre,delTE);
% calculating and plotting regression line
x=gapmedian_pre;
X=[ones(length(x),1),x]; % design matrix with intercept column
y=delTE;
b=X\y;                   % least-squares fit: b(1)=intercept, b(2)=slope
yCalc1=X*b;
Rsq=1-sum((y-yCalc1).^2)/sum((y - mean(y)).^2);
%% plotting
figure('Name','gap duration vs change in TE')
title('del Transition entropy vs gap duration')
hold on
for iBird=1:nBirds
    scatter(gapmedian_birds{iBird}(:,1),delTE_birds{iBird},birdMarkers{iBird})
end
yline(0,'k--')
xlabel('Pre lesion gap duration (ms)')
ylabel('Change in transition entropy within chunk')
plot(x,yCalc1,'--k')
txt1={['Corr.coef.=',num2str(correlation)],['p=',num2str(round(pval,3))],['Rsq=',num2str(round(Rsq,3))]};
text(90,1,txt1);
%text(90,1.1,txt2)
hold off
%% helper funcs
function [naiveoverallentropy,repoverallentropy]=calc_overallentropy(prelesionseq,postlesionseq,prelesionchunkseq,postlesionchunkseq)
% Overall transition entropy of four sequences, so that entropies before
% and after lesion can be compared (written for the MMAN paper).
%
% inputs:
%   prelesionseq, postlesionseq: naive sequences, usually with repeats
%       removed (because for chi sq analysis you have to remove repeats and
%       that is also what is done in the Katahira 2013 paper; repeats = syl
%       repeating > 2 times).
%   prelesionchunkseq, postlesionchunkseq: outputs of the chunk extraction
%       function, i.e. the sequences with states replaced but NOT chunks
%       replaced. Repeats are removed before giving the input to this
%       function.
% outputs:
%   naiveoverallentropy = [pre, post] overall entropy of the naive sequences
%   repoverallentropy   = [pre, post] overall entropy of the state-replaced
%                         sequences
% Transition entropy is for all syllables, not just the branch points.
%
% The label list is rebuilt from scratch for every sequence. The previous
% version reused the same growing cell array for the naive and the replaced
% sequence, so labels of the naive sequence leaked into the label set of the
% replaced sequence whenever the latter had fewer unique symbols.

% first use un-replaced, naive sequences
naiveoverallentropy=[overallentropy_fromseq(prelesionseq),...
    overallentropy_fromseq(postlesionseq)];

% now use the chi-sq analysis, replaced sequences
repoverallentropy=[overallentropy_fromseq(prelesionchunkseq),...
    overallentropy_fromseq(postlesionchunkseq)];
end

function overallte=overallentropy_fromseq(seq)
% Overall transition entropy of one sequence: every unique symbol of seq is
% a label, the transition count matrix is built with rare-node elimination
% switched on, and the frequency-weighted entropy is taken from transentropy.
syls=unique(seq);
labels=num2cell(syls(:)'); % 1-by-n cell, one symbol per cell
[~,~,rawmtx]=calctransitionprob_fromsequence(labels,seq,1);
[~,overallte]=transentropy(rawmtx);
end

function[testcell,divprobtoplot,TPnew,freq]=calctransitionprob_fromsequence(labels,seq,eliminaterare)
% Transition counts and (divergence) transition probabilities between labels.
%
% inputs:
%   labels        = cell array of symbols/patterns; entry i is the 'from'
%                   label of row i and the 'to' label of column i
%   seq           = sequence (char vector) in which transitions are counted
%   eliminaterare = if true, nodes that make up less than 1% of all counted
%                   transitions are zeroed (rows and columns); the first
%                   label is never removed
% outputs:
%   testcell      = labels{i} followed by labels{j}, for every pair (i,j)
%   divprobtoplot = transition matrix in rounded percent, normalised per
%                   row ('from' label); NaN (empty rows) replaced by 0
%   TPnew         = rawmtx/matrix of transition counts
%   freq          = column sums of the counts divided by the total count
%                   (computed BEFORE rare nodes are eliminated)
%
% Counting is done with regexp, so matches are non-overlapping and labels
% are interpreted as regular expressions.
nlabels=length(labels);
% preallocate so that the outputs exist even for an empty label list
testcell=cell(nlabels,nlabels);
TPnew=zeros(nlabels,nlabels);
for i=1:nlabels
    for j=1:nlabels
        testcell{i,j}=[labels{i}, labels{j}];
        TPnew(i,j)=length(regexp(seq,testcell{i,j}));
    end
end
% according to Lena's logic I should be only summing over rows
% tot=sum(nonzeros(TPnew));
tot=sum((TPnew),2);
sumcols=sum(TPnew,1);
grandtot=sum(tot);
freq=sumcols/grandtot;

if eliminaterare && nlabels>0
    %eliminate rare nodes
    rarenodes = tot./sum(tot)<0.01; %0.025;
    % so that the start (first label) is not removed.
    % The earlier 'rarenodes=rarenodes(rarenodes~=1)' was written for an
    % index list; applied to this logical mask it produced a shorter,
    % all-false mask, so no rare node was ever eliminated.
    rarenodes(1)=false;
    tot(rarenodes)=0;
    TPnew(rarenodes,:)=0;
    TPnew(:,rarenodes)=0;
end

% row-wise normalisation; rows whose total is 0 give NaN and are set to 0
divprob=(TPnew)./repmat(tot,1,size(TPnew,2));
divprob = round(divprob*100);
divprobtoplot = divprob;
divprobtoplot(isnan(divprobtoplot))=0;
end

function [h,chi2,p]=chisq_2dist(dist1,dist2,alpha)
% Chi-square test comparing two count distributions over the same categories.
% source= https://www.youtube.com/watch?v=Cert35F-w4c
%
% inputs:
%   dist1 = observed values (counts per category)
%   dist2 = expected values (counts per category, same order as dist1)
%   alpha = significance level, default 0.01
% outputs:
%   h    = 1 if p<alpha (distributions differ), otherwise 0
%   chi2 = chi-square statistic of the n-by-2 table [dist1,dist2]
%   p    = upper-tail probability of chi2 with n-1 degrees of freedom
%
% Categories where dist1 is zero are removed from BOTH distributions before
% the test.
if nargin<3 || isempty(alpha)
    alpha=0.01;
end
zeroobs=(dist1==0);
dist1(zeroobs)=[]; %removing zeros based on obs values
dist2(zeroobs)=[]; %removing zeros based on obs values
if isrow(dist1)
    dist1=dist1';
end
if isrow(dist2)
    dist2=dist2';
end
df=length(dist1)-1;
obs=[dist1,dist2];
rowtot=sum(obs,2); % total per category
coltot=sum(obs,1); % total per distribution
tot=sum(sum(obs));
% expected count of every cell under independence: rowtot(i)*coltot(j)/tot
expected=(rowtot*coltot)/tot;
chi2=sum(sum(((obs-expected).^2)./expected));
% upper tail computed directly: 1-chi2cdf(...) rounds to exactly 0 as soon
% as the cdf is within eps of 1, i.e. for every strongly significant result
p=chi2cdf(chi2,df,'upper');
if p<alpha
    h=1;
else
    h=0;
end
end

function [con]=chunkconsistency(seq,expr,len)
% Consistency of a chunk: all occurrences of expr in seq are cut out
% (len symbols each), every occurrence is prefixed with a start symbol, and
% the transition counts between the symbols of these pieces are computed.
% con = sum over rows of the largest count per row / sum of all counts,
% with the start symbol's row and column left out.
%
% inputs:
%   seq  = sequence (char row vector)
%   expr = chunk to look for; may be a regular expression
%   len  = number of symbols per occurrence; specify it if you're using
%          something like '[lB].' (default: length(expr))
% output:
%   con  = consistency value; NaN if expr does not occur in seq
% find consistency per chunks 14.07.22 Avani
if ~exist('len','var')
    len=length(expr);
end
point=regexp(seq,expr);
% a match closer than len symbols to the end cannot be cut out completely
point=point(point+len-1<=length(seq));
if isempty(point)
    con=NaN;
    return
end
npoints=length(point);
% one occurrence per row (npoints-by-len char matrix)
cutidx=bsxfun(@plus,point(:),0:len-1);
list=reshape(seq(cutidx),size(cutidx));
%what should Y char be? It should be simply LESS in double value than all
%unique syls in the sequence, so that it sorts first in unique() below.
Ychar=char(min(double(seq))-1);

newlist=[repmat(Ychar,npoints,1) list];
% concatenate the rows one after the other. The previous loop ran to
% length(newlist), which is the LARGEST dimension and not the number of
% rows: it errored when there were fewer occurrences than len+1.
longstr=reshape(newlist',1,[]);

%convert to cell array for labels
labels=num2cell(unique(longstr));
[~,~,TPnew]=calctransitionprob_fromsequence(labels,longstr,0);
% remove first row and first column (the start symbol)
TPnew2= TPnew(2:end,2:end);
con = sum(max(TPnew2,[],2))./sum(sum(TPnew2));
end

function[paths]=chunkextraction(g,thresh)
% Extract chunks (linear paths) from a transition graph.
%
% inputs:
%   g      = digraph; edge weights are compared against thresh. The node
%            named 'Y' is treated as the start node.
%   thresh = minimum edge weight for an edge to be followed (the caller in
%            this file passes 80, with weights in percent)
% output:
%   paths  = cell array; each cell holds the node indices of one path
%
% find continuous path of 80% nodes
% separate nodes as one in one out, multiple in multiple out,
% one in multiple out, multiple in one out
oimo=[]; % one in, multiple out
mioo=[]; % multiple in, one out
mimo=[]; % everything else (incl. nodes without incoming or outgoing edges)
oioo=[]; % one in, one out
for i=1:g.numnodes
    [~,nIN]=inedges(g,i);
    [~,nOUT]=outedges(g,i);
    if length(nIN)==1 & length(nOUT)>1
        oimo=[oimo,i];
    elseif length(nIN)==1 & length(nOUT)==1
        oioo=[oioo,i];
    elseif length(nIN)>1 & length(nOUT)==1
        mioo=[mioo,i];
    else mimo=[mimo,i];
    end
end
nexplored=[]; % nodes that have already been visited by some path
paths={};
% use nodes in mioo (mult in one out) as starter nodes
for ij=1:length(mioo)
    v=mioo(ij); %also consider starting at 1
    [eid,w]=outedges(g,v); % v has exactly one outgoing edge: eid, leading to w
    chkpath=[v];
    nexplored=[nexplored,v]; 
    u=w;
    if ismember(w,oioo)
        % walk along one-in-one-out nodes as long as the edge INTO the
        % current node w is at least thresh
        while g.Edges(eid,:).Weight>=thresh && ismember(w,oioo) %only oioo paths %TODO: oimo paths
            chkpath=[chkpath,w];
            [eidu,u]=outedges(g,w);
            nexplored=[nexplored,w];
            w=u;
            eid=eidu;     
        end
        nexplored=[nexplored,u]; %its okay if some us are double
        % a one-in-multiple-out node may close the path (it is its last node)
        if g.Edges(eid,:).Weight>=thresh && ismember(u,oimo)
            chkpath=[chkpath,u];
        end
        paths=[paths,chkpath];
    elseif g.Edges(eid,:).Weight>=thresh && ismember(w,oimo)
        % two-node path: starter node followed directly by an oimo node
        nexplored=[nexplored,w];
        chkpath=[chkpath,u];
        paths=[paths,chkpath];
    end
end
%% for chunks starting from v=Y
newset=setdiff(oioo,nexplored); %unexplored oioo nodes
name_start=findnode(g,'Y'); %%here you find the num of node Y
% NOTE(review): if g has no node named 'Y', findnode presumably returns 0
% and inedges(g,0) below would fail - confirm that 'Y' is always present.
% I think I should force name_start to be considered here
newset=[newset,name_start];
newset=unique(newset); %in case name_start was already in the set

for ik=1:length(newset) %are sorted by seq anyway
    v=newset(ik);
    [~,k]=inedges(g,v); % k = predecessor node(s) of v
    % case 1: v is the start node 'Y' - always used as a starter
    % NOTE(review): 'Y' is not required to have a single outgoing edge, so
    % eid/w can be vectors here; the '&&' in the while condition below needs
    % scalar operands - verify for graphs where 'Y' branches.
    if v==name_start %%here
       [eid,w]=outedges(g,v);
       chkpath=[v];
       nexplored=[nexplored,v];
       u=w;
        if ismember(w,oioo)
            while g.Edges(eid,:).Weight>=thresh && ismember(w,oioo) %only oioo paths %TODO: oimo paths
                chkpath=[chkpath,w];
                [eidu,u]=outedges(g,w);
                nexplored=[nexplored,w];
                w=u;
                eid=eidu;     
            end
            nexplored=[nexplored,u]; %its okay if some us are double
            if g.Edges(eid,:).Weight>=thresh & ismember(u,oimo)
            chkpath=[chkpath,u];
            end
            paths=[paths,chkpath];
        elseif g.Edges(eid,:).Weight>=thresh & ismember(w,oimo)
            nexplored=[nexplored,w];
            chkpath=[chkpath,u];
            paths=[paths,chkpath];
        end
    end
    % case 2: unexplored one-in-one-out node whose predecessor is not itself
    % a one-in-one-out node, i.e. the first node of a linear stretch
    if v~=name_start && ~ismember(k,oioo) & ~ismember(v,nexplored) %%here
        [eid,w]=outedges(g,v);
        chkpath=[v];
        nexplored=[nexplored,v];
        u=w;
        if ismember(w,oioo)
            while g.Edges(eid,:).Weight>=thresh & ismember(w,oioo) %only oioo paths %TODO: oimo paths
                chkpath=[chkpath,w];
                [eidu,u]=outedges(g,w);
                nexplored=[nexplored,w];
                w=u;
                eid=eidu;     
            end
            nexplored=[nexplored,u]; %its okay if some us are double
            if g.Edges(eid,:).Weight>=thresh & ismember(u,oimo)
            chkpath=[chkpath,u];
            end
            paths=[paths,chkpath];
        elseif g.Edges(eid,:).Weight>=thresh & ismember(w,oimo)
            nexplored=[nexplored,w];
            chkpath=[chkpath,u];
            paths=[paths,chkpath];
        end
    end
end
end

function [histdep]=historydependence_bp(seq,bp,len)
    % History dependence for a given seq and branchpoint pair.
    %
    % inputs:
    %   seq = sequence (char row vector)
    %   bp  = branch point; may be a regular expression
    %   len = number of symbols matched by bp (default: length(bp)); needed
    %         when bp is a regular expression
    % output:
    %   histdep = abs( p(ab at n | ac at n-1) - p(ab at n | ab at n-1) )
    %       where 'ab' is the most frequent transition from the branch point
    %       and 'ac' is any other transition from it. A probability that
    %       cannot be computed (no such previous transition) counts as 0.
    %       histdep is 0 if the branch point has no transitions at all.
    if ~exist('len','var')
        len = length(bp);
    end
    % symbols following every occurrence of the branch point, in order
    try
        nextsylstr=seq(regexp(seq,bp)+len); %so that you can put a regexp
    catch
        % the last occurrence sits at the very end of seq and has no
        % following symbol: drop the end and try again
        seq=seq(1:end-len);
        nextsylstr=seq(regexp(seq,bp)+len);
    end
    if isempty(nextsylstr)
        histdep=0;
        return
    end
    alltrans=unique(nextsylstr);
    counts=zeros(1,length(alltrans));
    for i=1:length(alltrans)
        str=[bp,alltrans(i)];
        counts(i)=length(regexp(seq,str));
    end
    % find the most probable transition 'ab'. max returns ONE index, so b is
    % always a single symbol; with 'alltrans(probmtx==max(probmtx))' a tie
    % made b a multi-character string and the comparisons below searched
    % for that string instead of a single transition.
    [~,imax]=max(counts);
    b=alltrans(imax);

    prevtrans=nextsylstr(1:end-1); % transition at n-1
    thistrans=nextsylstr(2:end);   % transition at n
    % direct character comparison instead of a regular expression built
    % from b, so symbols like '^' or ']' cannot break the pattern
    afterab=thistrans(prevtrans==b); %all transitions after the dominant transition
    probabab=sum(afterab==b)/length(afterab); % prob of ab|ab = #ab|ab/#a-all|ab
    afterac=thistrans(prevtrans~=b); %all transitions after a transition that is NOT ab
    probacab=sum(afterac==b)/length(afterac); % prob of ab|ac = #ab|ac/#a-all|ac

    if isnan(probacab)
        probacab = 0;
    end
    if isnan(probabab)
        probabab = 0;
    end
    histdep=abs(probacab-probabab);

end

function[chunks2,percY,seqforchunks,chunks2replace,labelidx,newseq,divprobtoplot2,patterncell2,labels2,numnewsyls]=seq_chunkextractionfunc(seqnew,~)
% This function tries to automate extraction of chunks based on chi sq
% analysis. First, we try to determine if a syllable's next transition
% depends on the syllable that comes before it using chi sq analysis. If it
% does, we relabel the syllable and use it as a different 'state' of the
% same syllable. Then using the chunk extraction function, we create chunks
% with one-in-one-out branches, >80% transition prob as middle nodes.
% Bonferroni correction added where alpha = 0.01/n where n = number of
% comparisons made 08.03.2023
% This function gives you chunks and plots old transition diagram and new
% transition diagram automatically.
%
% Input:
%   seqnew = a clean sequence string with intro notes replaced and
%            start symbol 'Y' already present
%   second input = ignored (declared as '~').
%            NOTE(review): the plotting blocks below test
%            exist('plotting','var'); no variable 'plotting' is ever
%            created in this function, so the plots are always produced and
%            the old instruction 'write 0 for plotting' has no effect.
% Outputs:
%   chunks2        = chunk names to plot (built from patterncell)
%   percY          = frequency (in percent) of the chunk containing 'Y' in
%                    the chunk-replaced sequence
%   seqforchunks   = seqnew with the new states replaced; needed for chunk
%                    consistency analysis
%   chunks2replace = chunk names as they occur in seqforchunks; needed for
%                    chunk consistency analysis
%   labelidx       = one row per relabelled context: {positions in seqnew,
%                    replacement character, display name, context string}
%   newseq         = the seq w/ chunks replaced
%   divprobtoplot2 = final transitionprob matrix
%   patterncell2   = the patterncell (node names) for the final digraph
%   labels2        = what are the replaced labels
%   numnewsyls     = number of additional states
% If anything inside the chunk extraction part fails, a message is printed
% and the outputs set in the catch block at the end are returned instead.
%%
unq=unique(seqnew);
% for all syllables
% counts and probabilities of all syls to everything
for i=1:length(unq)
    for j=1:length(unq)
        transmat{i}{j}=[unq(i),unq(j)];
        countst{i}(j)=length(strfind(seqnew,[unq(i),unq(j)])); % counting WITH overlap
    end
    probst{i}=countst{i}/sum(countst{i});
end
%%
% prob of all syls to everything given each syl before
% make matrix of all by all: mat{xi,xj} = pair 'previous syl, this syl'
for xi=1:length(unq)
    for xj=1:length(unq)
        mat{xi,xj}=[unq(xi),unq(xj)];
    end
end
%%
% countstg{yi,yj}(yk) = number of triplets unq(yi),unq(yj),unq(yk), i.e. the
% transitions of syl yj given that syl yi came before it
for yi=1:length(unq)
    for yj=1:length(unq)
        for yk=1:length(unq)
            test{yi,yj}{yk}=[mat{yi,yj},unq(yk)]; %every column is x-yoursyl-y
            countstg{yi,yj}(yk)=length(strfind(seqnew,[mat{yi,yj},unq(yk)]));
        end
    end
end
%% chisq test
h = nan(length(unq));
chi = nan(length(unq));
p = nan(length(unq));
countst2=countst; %saving countst before I modify it to delete linear branches
% remove incredibly small branches (below 1% of a syllable's transitions)
% which are likely mislabels
for ix=1:length(countst)
    num=countst{ix}./sum(countst{ix});
    pink=countst{ix};
    pink(num<0.01)=0;
    countst{ix}=pink;
end
% syllables with only one remaining transition (linear branches) are set to
% 0, so they are not tested
nzeros=cellfun(@(x) length(nonzeros(x))>1,countst, 'UniformOutput',0);
nz2=cellfun(@(x) x==1,nzeros);
countst(~nz2)={0};
% remove all comparisons of column 1
colY=strfind(unq,'Y'); % this is the column of countst which corresponds to Y
countst{colY}=0; %because Y doesnt actually depend on anything
% filtering out cells of countstg which are at most 1% of the total
% transitions of that syllable
testcountstg=countstg; % 'saving' countstg
%testcountstg is the one that stays intact; countstg will be modified;
%go through all cells and delete whole cells if sum within the cell is at
%most 1% of the total times this syllable is observed
sumcols=sum(cellfun(@(x) sum(x),testcountstg),1);
for i=1:size(countstg,1)
    for j=1:size(countstg,2)
        if sum(testcountstg{i,j})/sumcols(j)>0.01 %filtering out cells of countstg which appear very rarely %10%
            newtestcnt{i,j}=testcountstg{i,j};
        else newtestcnt{i,j}=[];
        end
    end
end
countstg=newtestcnt;

% carrying out chi sq test:
% zi = syllable under test, zj = syllable that precedes it
for zi=1:length(unq)
    existingpostsyls = find(countst{zi}./sum(countst{zi})>0.01); %vector which gives indices of not-rare transitions; >0.01
    for zj=1:length(unq)
            %numstates should be number of possibilities of
            %preceeding syl
            % NOTE(review): this counts the non-empty cells of column zj of
            % countstg, while the test below is on column zi - confirm that
            % zj is the intended column for the Bonferroni correction.
            numstates=sum(cellfun(@(x) ~isempty(x),countstg(:,zj)));
        if ~isempty(countstg{zj,zi})
            if any(countstg{zj,zi}(existingpostsyls)./sum(countst{zi}(existingpostsyls))>0.01)
                if length(nonzeros(countstg{zj,zi}(existingpostsyls)))>1 %because you need at least a vector of 2 for chi sq test
                    %if this conditional branch is not 1% of the total branches of
                    %that syllable 
                    testnew{zj,zi}=test{zj,zi}(existingpostsyls);
                    [h(zj,zi),chi(zj,zi),p(zj,zi)] = chisq_2dist(countst{zi}(existingpostsyls),countstg{zj,zi}(existingpostsyls),0.01/numstates);
                    %changed on 08.03.23, bonferroni correction
                    %if zi =1, first element of countst ie YY,Ya,Yb etc filtered to nonzero
                    %transitions vs all rows of countstg over first column (ie
                    %YYY,YYa,YYb etc) filtered to nonzero transitions
                elseif length(nonzeros(countstg{zj,zi}(existingpostsyls))) == 1
                    h(zj,zi) = 1; %automatically, it becomes a separate state in itself
                end
            end
        end
    end
end

%% keep only the ones with h=1
[a,b]=find(h==1);
cellremaining=cell(length(unq));
for i=1:length(a)
    cellremaining{a(i),b(i)}=mat{a(i),b(i)};
end
%% now do the relabelling thing
% find letters not present in unq
allletters=[char(97:122),char(65:90),'0123456789']; % a-z, A-Z and 0-9 are the candidate replacement characters
avletters1=allletters(~ismember(allletters,unq)); %available letters
avletters=avletters1(end:-1:1); %just to make them more noticable when replaced

%% collecting indices for replacing and replacing together
states=cellfun(@(x) ~isempty(x),cellremaining);
sumstates=sum(states,1);
torelabel=cellremaining(:,sumstates>=1);

% NOTE(review): labelidx is only created inside this loop; if no context
% has h==1 it stays undefined and size(labelidx,1) further down fails.
tt=1; %idx for avletters
for ti=1:size(torelabel,2)
    stateidx=find(~(cellfun('isempty',torelabel(:,ti)))); %index of combos to relabel in column ti
    for tj=1:length(stateidx)
        repseq=torelabel{stateidx(tj),ti};
        labelidx{tt,1}=strfind(seqnew,repseq)+1; %because you're looking to replace the 2nd syl
        labelidx{tt,2}=[avletters(tt)];
        labelidx{tt,3}={[repseq(2),num2str(tj)]};
        labelidx{tt,4}={repseq};
        tt=tt+1;
    end
end
%% see if the replaced syls actually are different WITHIN themselves
[a,b]=find(h==1);
remcountstg=cell(size(countstg));
for i=1:length(a)
    remcountstg{a(i),b(i)}=countstg{a(i),b(i)}; %remcountstg = counstg where h==1 ie the dists are different
end
states2=cellfun(@(x) ~isempty(x),remcountstg);
sumstates=sum(states2,1);
% only syllables with more than one new state can have states to merge
remcountstg=remcountstg(:,sumstates>1);
which_to_merge=cell(size(remcountstg));
names_to_merge=cell(size(remcountstg));
for ip=1:size(remcountstg,2)
    emptidx=~(cellfun('isempty',remcountstg(:,ip)));
    x=remcountstg(emptidx,ip);
    t=cellfun(@(y) y./sum(y),x,'UniformOutput',false); %actual percentages are stored here for a
    a=cellfun(@(y) find((y./sum(y))>0.05),x,'UniformOutput',false); %branches below 5% are ignored
    f=cellfun(@(y) find((y./sum(y))>0.05),remcountstg(:,ip),'UniformOutput',false); 
    a = a(:);
    % find sets of major branches that occur for more than one state:
    % the index vectors are converted to char so that unique can compare them
    [~,b,c] = unique(cellfun(@char,a,'un',0));
    lo = histc(c,1:max(c));
    loo = lo(:) > 1;
    out = [a(b(loo))];
    % condition for if length(out{ir}) = 1 has been added
    for ir=1:length(out)
        index=[];
        for k = 1:numel(f)
            % try: comparing index vectors of different length throws
            try
                if f{k} == cell2mat(out(ir))
                    ind = (f{k} == cell2mat(out(ir)));
                    index=[index;k]; % found indices of remcountstg of cells with this particular output 
                end
            end
        end
        if length(out{ir})>1 %then do all the testing
            % test these against each other
            % what if there are more than 2 options? unlikely, but still..
            pairwise_combinations=nchoosek(index,2); %here you get combinations of indices
            % NOTE(review): length() of this K-by-2 matrix is max(K,2), not
            % necessarily the number of pairs K - confirm the intended
            % number of comparisons for the correction.
            numstates2=length(pairwise_combinations)/2; %modified on 31.03.2023
            for id=1:size(pairwise_combinations,1) %along rows
                data1=remcountstg{pairwise_combinations(id,1),ip};
                data2=remcountstg{pairwise_combinations(id,2),ip};
                h_testing_in_pairs=chisq_2dist(data1,data2,0.01/numstates2); %modified on 08.03.2023
                % states whose distributions do not differ are merged
                if h_testing_in_pairs == 0
                    which_to_merge{ir,ip}=[which_to_merge{ir,ip};pairwise_combinations(id,1),ip;pairwise_combinations(id,2),ip];
                end
            end
        elseif length(out{ir})==1 %merge them automatically if there is only one major branch
            for ig=1:length(index)
                which_to_merge{ir,ip}=[which_to_merge{ir,ip};index(ig),ip]; %make vector of all indices + col number
            end
        end
        which_to_merge{ir,ip}=unique(which_to_merge{ir,ip},'rows');
        try %in case which_to_merge is empty
        rows=which_to_merge{ir,ip}(:,1);
        cols=which_to_merge{ir,ip}(:,2);
        names_to_merge{ir,ip}=[];
            % NOTE(review): cols indexes the columns of the reduced
            % remcountstg (sumstates>1) but is used on torelabel
            % (sumstates>=1) - verify that both column sets line up.
            for ix=1:length(rows)
                names_to_merge{ir,ip}=[string(names_to_merge{ir,ip});string(torelabel{rows(ix),cols(ix)})];
            end
        end
    end
end
% note for which_to_merge: ip= which COLUMN of
% torelabel to merge,go over ALL ir's; those are the
% different TYPES of combos available. Everything with the indices that sit
% inside which_to_merge{ir,ip} should be the SAME syllable 
%% now prune labelidx based on your which_to_merge matrix
% go over every element of names_to_merge
for num_merg=1:numel(names_to_merge)
    thiscell=names_to_merge{num_merg};
    repidx=[];
    for idx=1:length(thiscell) %because 2 or more syllables might be alike and in need of merge
        str=thiscell(idx);
        for collen=1:size(labelidx,1)
            if strfind(labelidx{collen,4}{:},str)
                repidx=[repidx;collen];
            end
        end
    end
    % all states to merge get the replacement character and the display
    % name of the first of them
    if ~isempty(repidx)
        torepwith=labelidx{repidx(1),3};
        torepwith2=labelidx{repidx(1),2};
        for replen=2:length(repidx)
            labelidx{repidx(replen),3}=torepwith;
            labelidx{repidx(replen),2}=torepwith2;
        end
    end
end
%% replacing stuff
for xi=1:size(labelidx,1) %along rows
    seqnew(labelidx{xi,1})=labelidx{xi,2};
end
% this is the seqnew I'd want for my chunks consistency analysis
seqforchunks=seqnew;
%%
% keep only symbols that make up more than 0.9% of the sequence
[a,freq]=uniquestring(seqnew,0.9); % was 0.9 HERE changed from 0.05 to 1 for bubu but bk2bk10 needs <1
labels=cell(1,length(a));
%unq=unique(seqnew);
for unqidx=1:length(a)
    labels{unqidx}=a(unqidx);
end
%% create patterncell
% patterncell = labels, with replacement characters swapped for their
% display names (syllable + state number)
patterncell=labels;
for ix=1:size(labelidx,1)
    idx=strcmp(labels,labelidx{ix,2});
    patterncell(idx)=labelidx{ix,3};
end
%%
[~,divprobtoplot,~]=calctransitionprob_fromsequence(labels,seqnew,1);
% delete tiny percentage branches from divprobtoplot before making the
% graph
%divprobtoplot(abs(divprobtoplot)<4)=0; % 5% and above branches only

% this call is needed for the graph g used in the chunk extraction below
% (it also opens a figure)
g=seq_plot_digraph(divprobtoplot,patterncell,'TGraph','PRE',freq*500);
for im=1:length(patterncell)
    labelsnum{im}=num2str(im);
end
%g = digraph(divprobtoplot,labelsnum)
if ~exist('plotting','var')
f=seq_plot_digraph(divprobtoplot,labelsnum,'TGraph','PRE',freq*500);
%%this second graph shows the node numbers instead of the names; useful if
%%you want to troubleshoot the graph and want to know the node numbers quickly

seq_plot_transitionmatrix(divprobtoplot,patterncell,1,'TMat','PRE'); 
end
%% chunk extraction
% everything from here on is wrapped in try/catch: on any error a message
% is printed and the outputs set in the catch block are returned
try
[paths]=chunkextraction(g,80); 
chunks = cellfun(@(x) [patterncell(x)],paths,'UniformOutput',false);
chunks2=cellfun(@(x) [x{:}],chunks,'UniformOutput',false); %chunk names to plot
[~,uidx] = unique(chunks2,'stable');
chunks2=chunks2(uidx);
chunks2plot=cellfun(@(x) [labels(x)],paths,'UniformOutput',false);
chunks2replace=cellfun(@(x) [x{:}],chunks2plot,'UniformOutput',false); %chunk names to replace
chunks2replace=chunks2replace(uidx);
%% create transition matrix
[newseq,pp,newchunknames] = ak_replacechunks(seqnew,chunks2replace);

%% create labelcell2
[a,freq]=uniquestring(newseq,0);
% adding a condition such that the Yi chunk is never removed
chunk_with_Y=cellfun(@(x) strfind(x,'Y'),pp,'UniformOutput',false);
num_Y=find(~cellfun(@isempty,chunk_with_Y));
Y_in_a=strfind(a,newchunknames(num_Y));
freqY=freq(Y_in_a);
percY=freqY*100; %in percentage

%here the threshold is changed such that Yi HAS to be in the graph:
%symbols must be more frequent than 70% of the frequency of the Y chunk

[a,freq]=uniquestring(newseq,percY*0.70); %changed by Avani and Lena on 31.03.2023
labels2=cell(1,length(a));
for unqidx=1:length(a)
    labels2{unqidx}=a(unqidx);
end
[~,divprobtoplot2,~]=calctransitionprob_fromsequence(labels2,newseq,0);
%% create patterncell2 to plot in final transition graph
patterncell2=labels2;
for ix=1:size(labelidx,1)
    idx=strcmp(labels2,labelidx{ix,2});
    patterncell2(idx)=labelidx{ix,3};
end
% replace chunks to plot in final transition graph
for iy=1:length(newchunknames)
    %Here i use try in case one of the chunks is removed during the
    %frequency thresholding by uniquestring above
    try
    idx2=strcmp(patterncell2,newchunknames(iy)); %for example, found pos of A
    patterncell2{idx2}=chunks2{iy};
    end
end
if ~exist('plotting','var')
% plotting
    seq_plot_digraph(divprobtoplot2,patterncell2,'TGraph','PRE',freq*100);
    seq_plot_transitionmatrix(divprobtoplot2,patterncell2,1,'TMat','PRE');
end
% i need an output seq which has all the 'new' states replaced
%% how many additional states
numnewsyls=length(unique(seqforchunks))-length(unq);
catch
    % fallback outputs; seqforchunks and labelidx keep the values assigned
    % before the try block. The line below has no semicolon on purpose: it
    % prints the message to the command window.
    '!!!!SOME MASSIVE ERROR!!!!'
    chunks2replace=[];
    newseq=0;
    divprobtoplot2=0;
    patterncell2=[];
    labels2=[];
    numnewsyls=0;
    chunks2=0;
    percY=0;
end
end

function g = seq_plot_digraph(divprob,patterncell,title1,preorpost,freq,thresh)
% plots directed graph. works for con and div.
%
% inputs:
%   divprob     = square transition matrix (the callers in this file pass
%                 rounded percentages); entry (i,j) is the weight of the
%                 edge from node i to node j
%   patterncell = cell array of node names
%   title1      = title of the axes
%   preorpost   = prefix of the figure name ([preorpost '_TGraph'])
%   freq        = (optional) marker size per node, e.g. scaled frequency
%                 of notes; omitted or empty = default size of 40
%   thresh      = (optional) branches with abs(weight) below thresh are not
%                 drawn; default 5. This argument used to be ignored.
% output:
%   g           = the digraph that was plotted (tiny branches removed)
if nargin<6 || isempty(thresh)
    thresh=5;
end

divprobtoplot = divprob;
divprobtoplot(abs(divprobtoplot)<thresh) = 0; %eliminate tiny branches
divprobtoplot(isnan(divprobtoplot))=0;

g = digraph(divprobtoplot,patterncell);%postpatterns
% line width proportional to the edge weight
width = 0.05*abs(g.Edges.Weight);
f=figure;

p = plot(g,'layout','circle','edgelabel',g.Edges.Weight,'linewidth',width,'arrowsize',15,'Edgefontsize',18,'nodefontname','arial','edgefontname','arial','nodefontsize',18,'edgealpha',1,'nodecolor',[0.7 .85 1],'markersize',40,'edgefontweight','bold','nodefontweight','bold');%
p.EdgeColor=[0,0,0];
p.NodeColor=[0.8,0.8,0.8];
title(title1)
f.Name=[preorpost '_' 'TGraph'];
if nargin>=5 && ~isempty(freq)
    p.MarkerSize=freq;
end
axis off
end

function seq_plot_transitionmatrix(mtx,labels,cm,title1,preorpost)
% Draws a transition matrix as an image in a new figure, with the value of
% every cell written on top of it.
%
% inputs:
%   mtx       = square transition matrix; NaN entries are drawn as 0 and
%               labelled with '-'
%   labels    = tick labels, used for both axes
%   cm        = colormap: 1 = one direction (inverted gray); 2 = two
%               directions out from 0 (custom 30-step map); anything else =
%               gray
%   title1    = title of the axes
%   preorpost = prefix of the figure name ([preorpost '_Tmatrix'])
%
% modified from Lucas in get all trans prob function

fig=figure;
fig.Name=[preorpost '_' 'Tmatrix'];
hold on;

nsyl=size(mtx,2);

% image of the matrix, NaN shown as 0
img=mtx;
img(isnan(mtx))=0;
imagesc(img);

switch cm
    case 1
        colormap(flipud(gray))
    case 2
        % diverging map, built channel by channel (15 + 15 steps)
        redch=[ones(15,1); linspace(1,0.1,15)'];
        greench=[linspace(0,1,15)'; linspace(1,0,15)'];
        bluech=[linspace(0.3,1,15)'; ones(15,1)];
        colo=[redch greench bluech];
        colormap(colo)
    otherwise
        colormap(gray)
end

% one text label per cell
celltxt=num2str(mtx(:),'%.0f');
celltxt(isnan(mtx),:)='-';
celltxt=strtrim(cellstr(celltxt));
[x,y]=meshgrid(1:nsyl);
hStrings=text(x(:),y(:),celltxt(:), 'HorizontalAlignment','center','Fontname','arial','fontsize',18,'fontweight','bold');

% white text on cells above the middle of the colour range, black otherwise
clim=get(gca,'CLim');
midValue=mean(clim);
textColors=repmat(mtx(:) > midValue,1,3);
set(hStrings,{'Color'},num2cell(textColors,2));

set(gca,'XTick',1:nsyl,'XTickLabel',labels, 'YTick',1:nsyl,...
    'YTickLabel',labels, 'TickLength',[0 0],'fontname','arial','fontweight','bold','fontsize',18);
ylabel('transition from:')
xlabel('transition to:')
title(title1)
end

function [ent,histdepall]=transent_prevspost(prelesionseq,bpprelesion,postlesionseq,bppostlesion,lenv)
    % Transition entropy and history dependence per branch point, before
    % and (optionally) after lesion.
    %
    % inputs:
    %   prelesionseq  = pre-lesion sequence
    %   bpprelesion   = cell array of branch points to look for in
    %                   prelesionseq
    %   postlesionseq = (optional) post-lesion sequence
    %   bppostlesion  = (optional) cell array of branch points for
    %                   postlesionseq, same order as bpprelesion
    %   lenv          = (optional) length of every branch point expression;
    %                   if you're giving a regular expression anywhere,
    %                   specify all lengths of expressions in this vector
    % outputs (one row per branch point; column 1 = pre, column 2 = post):
    %   ent        = transition entropy from transitionentropy_bp
    %   histdepall = history dependence from historydependence_bp; it is
    %                only computed where the entropy is not 0, otherwise it
    %                stays 0
    % Column 2 stays 0 if no post-lesion input is given.
    nbp=length(bpprelesion);
    ent=zeros(nbp,2);
    histdepall=zeros(nbp,2);
    haslen=exist('lenv','var');
    haspost=exist('postlesionseq','var') && exist('bppostlesion','var');
    for ij=1:nbp
        % optional length argument, passed on only when it was given
        if haslen
            lenarg={lenv(ij)};
        else
            lenarg={};
        end

        % pre lesion
        ent(ij,1)=transitionentropy_bp(prelesionseq,bpprelesion{ij},lenarg{:});
        if ent(ij,1)~=0
            histdepall(ij,1)=historydependence_bp(prelesionseq,bpprelesion{ij},lenarg{:});
        end

        % post lesion
        if haspost
            ent(ij,2)=transitionentropy_bp(postlesionseq,bppostlesion{ij},lenarg{:});
            if ent(ij,2)~=0
                histdepall(ij,2)=historydependence_bp(postlesionseq,bppostlesion{ij},lenarg{:});
            end
        end
    end

end 

function [te, overallte] = transentropy(rawmtx)
% Transition entropy from a matrix of transition counts (rows = 'from').
%
% outputs:
%   te        = transition entropy per row (branch point), in bits
%   overallte = overall transition entropy: the row entropies weighted by
%               the share of each row in the total count, as in Katahira 2013
% just te per bp should work with prob matrix as well

rowcounts = sum(rawmtx,2); % sum of all columns in a row
rowweights = rowcounts./sum(rowcounts);

% row-wise transition probabilities
probmtx = bsxfun(@rdivide,rawmtx,rowcounts);

% p*log2(p); NaN terms (p = 0, or rows without any count) are left out
plogp = probmtx.*log2(probmtx);
rowsum = sum(plogp,2,'omitnan');

te = -rowsum;
overallte = -sum(rowsum.*rowweights);
end

function [te]=transitionentropy_bp(seq,bp,len)
    % Transition entropy (in bits) at one branch point of a sequence.
    %
    % inputs:
    %   seq = sequence (char row vector)
    %   bp  = branch point; may be a regular expression, and may include
    %         context of the previous syl, e.g. 'ab' instead of 'b'
    %   len = number of symbols matched by bp (default: length(bp))
    % output:
    %   te  = entropy of the distribution of symbols that follow bp;
    %         0 if bp is never followed by anything
    if ~exist('len','var')
        len = length(bp);
    end

    % symbols that follow every occurrence of the branch point
    try
        followers=seq(regexp(seq,bp)+len); %so that you can put a regexp
    catch
        % an occurrence at the very end has no follower: cut the end off
        % and try once more
        try
            seq=seq(1:end-len);
            followers=seq(regexp(seq,bp)+len);
        catch
            followers=[];
        end
    end

    if isempty(followers)
        te=0;
        return
    end

    % count every observed transition 'bp -> symbol'
    targets=unique(followers);
    ntargets=length(targets);
    ncount=zeros(1,ntargets);
    for it=1:ntargets
        ncount(it)=length(regexp(seq,[bp,targets(it)]));
    end
    prob=ncount./sum(ncount);
    te=sum(-prob.*log2(prob),2);
end

function [outelts, outfreq] = uniquestring(seq,thresh)
% Unique elements of a string whose relative frequency is above a threshold.
%
% inputs:
%   seq    = sequence (char vector)
%   thresh = threshold in percent (e.g. 2 = more than 2% of all elements)
% outputs:
%   outelts = the unique elements of seq that pass the threshold
%   outfreq = their relative frequencies (fractions of 1, not percent)

[elts,~,eltidx]=unique(seq);
% occurrences of every unique element, as a fraction of all elements
relfreq = histc(eltidx,unique(eltidx));
relfreq = relfreq./(sum(relfreq));
keep = relfreq>thresh./100;
outelts = elts(keep);
outfreq = relfreq(keep);
end