% Extract the alternative start / MTS prediction data.
% Read the three input tables (paths are relative to the current folder).
Data = readtable("Table_S2_Secondary_screen_results_no_formulas.xlsx", 'Sheet', 'Data');
Mont = readtable("refs_data/Monteius.xlsx");
CGFP = readtable("refs_data/Brecker_2013.xlsx");
%% categorize things

% Convert the identifier / label columns to categorical so that they can be
% compared with == and ismember in the sections below.
Data.ORF = categorical(Data.ORF);
% NOTE: 'Final_phenptype' follows the spelling of the imported table variable.
Data.Final_phenptype = categorical(Data.Final_phenptype);
Mont.GeneName = categorical(Mont.GeneName);
% Fixed: this previously assigned to a misspelled variable (CGPF), which
% created an unrelated struct and left CGFP.ORF unconverted.
CGFP.ORF = categorical(CGFP.ORF);
CGFP.ControlLocalization = categorical(CGFP.ControlLocalization);
% Also calculate the absolute max prediction from all 3 algorithms:
% row-wise maximum (dim 2) over the three probability columns of Mont.
MontMaxPred = max([Mont.MitoFatesProbability Mont.TargetPProbability ...
    Mont.MitoProtProbability], [], 2);
%% Make a table based on CGFP data

% Split CGFP by ControlLocalization:
%   CGFPmt  - rows whose label is one of the mitochondrial labels below
%   CGFPnon - rows that are neither mitochondrial nor 'below threshold'
mitoLabels = {'mitochondria', 'cytosol,mitochondria', 'nucleus,mitochondria'};
isMito = ismember(CGFP.ControlLocalization, mitoLabels);
isBelowThreshold = CGFP.ControlLocalization == 'below threshold';

CGFPmt = CGFP(isMito, :);

CGFPnon = CGFP(~isBelowThreshold & ~isMito, :);
%% extract predictions and experimental data for each

% Obs_mt(n) is 1 when the ORF of row n of CGFPmt occurs in Data.ORF and
% every matching Data row has Final_phenptype equal to 'OBSERVED'; 0 otherwise.
Obs_mt = zeros([size(CGFPmt,1) 1]);

for n=1:size(CGFPmt,1)
    if ismember(CGFPmt.ORF(n), Data.ORF)
        inScreen = Data.ORF == CGFPmt.ORF(n);
        % all(): an ORF listed more than once counts only if every entry is OBSERVED
        if all(Data.Final_phenptype(inScreen) == 'OBSERVED')
            Obs_mt(n) = 1;
        end
    end
end

% Obs_non: same flag as Obs_mt, for the rows of CGFPnon.
% MaxPr_non(n): largest of the three prediction probabilities among the Mont
% rows for this ORF that have TISDistance == 0, or -1 when no such row exists.
Obs_non = zeros([size(CGFPnon,1) 1]);
MaxPr_non = zeros([size(CGFPnon,1) 1]);

for n=1:size(CGFPnon,1)
    % mt localization prediction for non-mito proteins
    AltS = Mont(Mont.GeneName == CGFPnon.ORF(n),:);
    % NOTE(review): TISDistance == 0 presumably selects the annotated start - confirm
    atZero = AltS.TISDistance == 0;
    % Stack all candidate probabilities into one column so that max() returns
    % a scalar whether zero, one or several rows qualify. The previous
    % max([a b c]) yielded [] (no qualifying row) or a 1x3 vector (several
    % rows), and either made the scalar assignment below fail.
    preds = [AltS.MitoFatesProbability(atZero); ...
        AltS.TargetPProbability(atZero); ...
        AltS.MitoProtProbability(atZero)];
    if isempty(preds)
        % sentinel: no prediction available (gene absent from Mont, or no
        % row with TISDistance == 0)
        MaxPr_non(n) = -1;
    else
        MaxPr_non(n) = max(preds);
    end
    % observed or not in our screen
    if ismember(CGFPnon.ORF(n), Data.ORF)
        inScreen = Data.ORF == CGFPnon.ORF(n);
        if all(Data.Final_phenptype(inScreen) == 'OBSERVED')
            Obs_non(n) = 1;
        end
    end
end

%% mito graph

% Pair each ControlMedian intensity with its observed flag, skipping rows
% whose ControlMedian is NaN.
hasMedian_mt = ~isnan(CGFPmt.ControlMedian);
Mt_table = [CGFPmt.ControlMedian(hasMedian_mt), Obs_mt(hasMedian_mt)];
% Sort by intensity, highest first; Rnk is the resulting 1-based rank.
Mt_table_s = sortrows(Mt_table, 1, 'descend');
Rnk = (1:size(Mt_table_s,1))';

% Row masks on the sorted table: observed (flag 1) vs. not observed (flag 0).
isObs_mt = Mt_table_s(:,2) == 1;
isNotObs_mt = Mt_table_s(:,2) == 0;

figure

hold on
% observed: red ticks; not observed: grey ticks (log intensity vs. rank)
c = scatter(Rnk(isObs_mt), log(Mt_table_s(isObs_mt,1)), ...
    '|', 'SizeData', 40, 'MarkerEdgeColor',[1 0 0]);
o = scatter(Rnk(isNotObs_mt), log(Mt_table_s(isNotObs_mt,1)), ...
    '|', 'SizeData', 40, 'MarkerEdgeColor',[0.5 0.5 0.5]);
% optional transparency (disabled):
%alpha(c, 0.5)
%alpha(o, 0.5)
%% non mito graph
% Prediction cut-off separating the "low" and "high" groups.
thresh = 0.6;

% Row selections on CGFPnon: a ControlMedian must be present; "low" requires
% a prediction in [0, thresh) (the -1 sentinel is excluded), "high" requires
% a prediction >= thresh.
hasMedian_non = ~isnan(CGFPnon.ControlMedian);
selLow = hasMedian_non & (MaxPr_non<thresh) & (MaxPr_non>=0);
selHigh = hasMedian_non & (MaxPr_non>=thresh);

% Columns: ControlMedian intensity, observed flag.
Non_table_low = [CGFPnon.ControlMedian(selLow), Obs_non(selLow)];
Non_table_high = [CGFPnon.ControlMedian(selHigh), Obs_non(selHigh)];

% Each group is sorted and ranked independently, highest intensity first.
Non_L = sortrows(Non_table_low, 1, 'descend');
Non_H = sortrows(Non_table_high, 1, 'descend');
Rnk_l = (1:size(Non_L,1))';
Rnk_h = (1:size(Non_H,1))';

% Observed / not-observed row masks for each sorted group.
lowObs = Non_L(:,2)==1;
lowNot = Non_L(:,2)==0;
highObs = Non_H(:,2)==1;
highNot = Non_H(:,2)==0;

figure
% low-prediction group: light grey dots (not observed), red circles (observed)
lnot = scatter(Rnk_l(lowNot), log(Non_L(lowNot,1)), ...
    '.', 'SizeData', 40, 'MarkerEdgeColor',[0.7 0.7 0.7]);
hold on
lobs = scatter(Rnk_l(lowObs), log(Non_L(lowObs,1)), ...
    'o', 'SizeData', 40, 'MarkerEdgeColor',[1 0 0]);

% high-prediction group: dark grey dots (not observed), red circles (observed)
hnot = scatter(Rnk_h(highNot), log(Non_H(highNot,1)), ...
    '.', 'SizeData', 40, 'MarkerEdgeColor',[0.4 0.4 0.4]);
hobs = scatter(Rnk_h(highObs), log(Non_H(highObs,1)), ...
    'o', 'SizeData', 40, 'MarkerEdgeColor',[1 0 0]);
xlim([-50 2200])
ylabel('Log protein abundance, a.u.')
xlabel('Rank')