Death by Tensor

January 18, 2012

Of all the objects in Math that might kill a man, tensors have surely been lethal on more than one occasion. Sure, it’s not a traditional way to check out, but I’ll bet it happens.

I don’t see a whole lot out there in the gnu space for manipulating tensors; there’s something for matlab from sandia with a scary looking license. When I get to production C/C++, there is a beautiful package put together by Landry at Utah, using expression templates. However nothing seemed to fit, when initially coding up Arghavani et al., integration algorithm for shape memory alloys, in octave. Most of the terse theory was relatively easy to code up, I used a block matrix representation for the Cartesian tensors, ostensibly of rank 2,4 and 6 eg., indices for rank 6 look like :

which in theory has good cache/locality properties; indices are increasingly fast (ie., smaller period) as one moves to smaller tiles. There are well known approaches for contraction involving indices occurring twice in expressions ie., using traditional matrix multiplication. I found it easiest to work in the space of the resultant, eg., rank2 -> rank4 before an operation involving a rank4 object etc. All was going well till I came to equation A9, specifically :

which, as you can see if you squint hard enough dear reader, involves contraction over multiple variables, and worse, concerns more than two elements. After my eyes quit bleeding, I resolved to stick with the block matrix approach; loops are far too slow when scripting, regardless of representation. I introduced a third dimension for summation, and moved tiles around to suit the contraction at a given step. I used a number of temporary variables for ease of reading the code, a misnomer if ever I saw/wrote one. Anyhoo, the result overall was code which runs ~ 20 times faster than loops, and written in terms of higher level objects, making the task of ultimately writing C++ much easier. IN theory.

a = rand(3,3);
d = rand(3,3);
b = kron(eye(3,3),rand(3,3));
c = kron(eye(3,3),rand(3,3));
f = eye(3,3);

bb = rand(3,3,3,3);
cc = rand(3,3,3,3);

%brief ndim test
tic

%contract m
for i=1:3
   for n=1:3
       for r=1:3
            for s=1:3
                for p=1:3
                    for q=1:3
                        first(i,n,r,s,p,q) = a(i,1) * bb(1,n,p,q) * cc(r,1,p,q) * d(1,s) + \
                            a(i,2) * bb(2,n,p,q) * cc(r,2,p,q) * d(2,s) + \
                            a(i,3) * bb(3,n,p,q) * cc(r,3,p,q) * d(3,s);

                    end
                end
            end
        end
    end
end

%contract n
for i=1:3
    for j=1:3
        for r=1:3
            for s=1:3
                for p=1:3
                    for q=1:3
                        second(i,j,r,s,p,q) = a(1,j) * first(i,1,r,s,p,q) + \
                            a(2,j) * first(i,2,r,s,p,q) + \
                            a(3,j) * first(i,3,r,s,p,q);
                    end
                end
            end
        end
    end
end

disp(['time brief ndim : ',num2str(toc)]);

%block matrix/loops test

tic
testa=zeros(27,27);
%contract m
for i=1:3
    for n=1:3
        for r=1:3
            for s=1:3
                for p=1:3
                    for q=1:3
                        testa((i-1)*3*3+(r-1)*3+p,(n-1)*3*3+(s-1)*3+q) = \
                            a(i,1) * b((1-1)*3+p,(n-1)*3+q) * c((r-1)*3+p,(1-1)*3+q) * d(1,s) + \
                            a(i,2) * b((2-1)*3+p,(n-1)*3+q) * c((r-1)*3+p,(2-1)*3+q) * d(2,s) + \
                            a(i,3) * b((3-1)*3+p,(n-1)*3+q) * c((r-1)*3+p,(3-1)*3+q) * d(3,s);
                    end
                end
            end
        end
    end
end
%contract n
for i=1:3
    for j=1:3
        for r=1:3
            for s=1:3
                for p=1:3
                    for q=1:3
                        testb((i-1)*3*3+(r-1)*3+p,(j-1)*3*3+(s-1)*3+q) = \
                            a(1,j) * testa((i-1)*3*3+(r-1)*3+p,(1-1)*3*3+(s-1)*3+q) + \
                            a(2,j) * testa((i-1)*3*3+(r-1)*3+p,(2-1)*3*3+(s-1)*3+q) + \
                            a(3,j) * testa((i-1)*3*3+(r-1)*3+p,(3-1)*3*3+(s-1)*3+q);
                    end    
                end
            end
        end
    end
end
%contract (formerly) r; new index p w/ 'a' row index
for i=1:3
    for j=1:3
        for s=1:3
            for p=1:3
                for r=1:3
                    for q=1:3
                        testc((i-1)*3*3+(p-1)*3+r,(j-1)*3*3+(s-1)*3+q) = \
                            a(p,1) * testb((i-1)*3*3+(1-1)*3+r,(j-1)*3*3+(s-1)*3+q) + \
                            a(p,2) * testb((i-1)*3*3+(2-1)*3+r,(j-1)*3*3+(s-1)*3+q) + \
                            a(p,3) * testb((i-1)*3*3+(3-1)*3+r,(j-1)*3*3+(s-1)*3+q);

                    end
                end
            end    
        end
    end
end

%contract (formerly) s; new index q w/ 'a' column index
for i=1:3
    for j=1:3
        for s=1:3
            for p=1:3
                for r=1:3
                    for q=1:3
                        testd((i-1)*3*3+(p-1)*3+r,(j-1)*3*3+(q-1)*3+s) = \
                            a(1,q) * testc((i-1)*3*3+(p-1)*3+r,(j-1)*3*3+(1-1)*3+s) + \
                            a(2,q) * testc((i-1)*3*3+(p-1)*3+r,(j-1)*3*3+(2-1)*3+s) + \
                            a(3,q) * testc((i-1)*3*3+(p-1)*3+r,(j-1)*3*3+(3-1)*3+s);

                    end
                end
            end    
        end
    end
end

%create bit in brackets
temp = a*d - (1/3).* f.* sum(sum(a.*d));

%distribute
for i=1:3
    for j=1:3
        for s=1:3
            for p=1:3
                for r=1:3
                    for q=1:3
                        teste((i-1)*3*3+(s-1)*3+r,(j-1)*3*3+(p-1)*3+q) = \
                            testd((i-1)*3*3+(s-1)*3+r,(j-1)*3*3+(p-1)*3+q) * temp(r,q);

                    end
                end
            end    
        end
    end
end

testf=zeros(3,27);

%contract p
for i=1:3
    for j=1:3
        for s=1:3
            for p=1:3
                for r=1:3
                    for q=1:3
                        testf(i,(j-1)*3*3+(s-1)*3+p) += \
                            teste((i-1)*3*3+(r-1)*3+q,(j-1)*3*3+(s-1)*3+p);

                    end
                end
            end    
        end
    end
end

testg=zeros(3,3);
%contract q
for i=1:3
    for j=1:3
        for r=1:3
            for q=1:3
                testg(i,j) += \
                    testf(i,(j-1)*3*3+(r-1)*3+q);

            end    
        end
    end
end

disp(['time separate loops :',num2str(toc)]);

tic

% expand 2nd  rank tensor U_im
%put m tiles in 3rd d
%i index has period 27 w/ matrix row index

foo(:,:,1)=kron(a(:,1),ones(9,27));
foo(:,:,2)=kron(a(:,2),ones(9,27));
foo(:,:,3)=kron(a(:,3),ones(9,27)); 

% expand 4th rank tensor G_mnpq
%put m tiles in 3rd d
%n index has period 27 w/ matrix column index
%p index has period 3 w/ block matrix row index
%q index has period 3 w/ block matrix column index

foo2(:,:,1)=[repmat(b(1:3,1:3),9,3), repmat(b(1:3,4:6),9,3), repmat(b(1:3,7:9),9,3)];
foo2(:,:,2)=[repmat(b(4:6,1:3),9,3), repmat(b(4:6,4:6),9,3), repmat(b(4:6,7:9),9,3)];
foo2(:,:,3)=[repmat(b(7:9,1:3),9,3), repmat(b(7:9,4:6),9,3), repmat(b(7:9,7:9),9,3)]; 

%expand 4th rank tensor Z_rmpq
%put m tiles in 3rd d
%r index has period 9 w/ block matrix row index
%p index has period 3 w/ block matrix row index
%q index has period 3 w/ block matrix column index

foo3(:,:,1)=repmat([repmat(c(1:3,1:3),1,9); repmat(c(4:6,1:3),1,9); repmat(c(7:9,1:3),1,9)],3,1);
foo3(:,:,2)=repmat([repmat(c(1:3,4:6),1,9); repmat(c(4:6,4:6),1,9); repmat(c(7:9,4:6),1,9)],3,1);
foo3(:,:,3)=repmat([repmat(c(1:3,7:9),1,9); repmat(c(4:6,7:9),1,9); repmat(c(7:9,7:9),1,9)],3,1); 

%expand 2nd rank tensor C_ms
%put m tiles in 3rd d
%s index has period 3 w/ block matrix column index

foo4(:,:,1)=repmat(kron(d(1,:),ones(27,3)),1,3);
foo4(:,:,2)=repmat(kron(d(2,:),ones(27,3)),1,3);
foo4(:,:,3)=repmat(kron(d(3,:),ones(27,3)),1,3);

%CONTRACT m; U_im * G_mnpq * Z_rmpq * C_ms ->  F_inrspq

F = sum(foo.*foo2.*foo3.*foo4,3);

%clear foo foo2 foo3 foo4

%reshape F_inrspq to suit contraction w/ n
%put n tiles in 3rd d

foo5(:,:,1) = F(:,1:9);
foo5(:,:,2) = F(:,10:18);
foo5(:,:,3) = F(:,19:27); 

%CONTRACT n; U_nj * F_inrspq = G_ijrspq

G = [(foo5(:,:,1).*a(1,1) + foo5(:,:,2).*a(2,1) + foo5(:,:,3).*a(3,1)), \    %j==1 tile
    (foo5(:,:,1).*a(1,2) + foo5(:,:,2).*a(2,2) + foo5(:,:,3).*a(3,2)), \    %j==2 tile
    (foo5(:,:,1).*a(1,3) + foo5(:,:,2).*a(2,3) + foo5(:,:,3).*a(3,3)) ];    %j==3 tile
%reshape G_ijrspq to suit contraction w/ r
%put r tiles in 3rd d; r has period 9 w/ row index

foo6(:,:,1) = G(1:3,:);            %r==1 tiles
foo6(:,:,2) = G(4:6,:);            %r==2 tiles
foo6(:,:,3) = G(7:9,:);            %r==3 tiles
foo6(:,:,4) = G(10:12,:);        %r==1 tiles
foo6(:,:,5) = G(13:15,:);        %r==2 tiles
foo6(:,:,6) = G(16:18,:);        %r==3 tiles
foo6(:,:,7) = G(19:21,:);        %r==1 tiles
foo6(:,:,8) = G(22:24,:);        %r==2 tiles
foo6(:,:,9) = G(25:27,:);        %r==3 tiles

%CONTRACT r; U_pr * G_ijrspq = H_ijpspq

H = [(foo6(:,:,1).*a(1,1) + foo6(:,:,2).*a(1,2) + foo6(:,:,3).*a(1,3)); \        %p==1 tiles
    (foo6(:,:,1).*a(2,1) + foo6(:,:,2).*a(2,2) + foo6(:,:,3).*a(2,3)); \        %p==2 tiles
    (foo6(:,:,1).*a(3,1) + foo6(:,:,2).*a(3,2) + foo6(:,:,3).*a(3,3)); \        %p==3 tiles
    (foo6(:,:,4).*a(1,1) + foo6(:,:,5).*a(1,2) + foo6(:,:,6).*a(1,3)); \        %p==1 tiles
    (foo6(:,:,4).*a(2,1) + foo6(:,:,5).*a(2,2) + foo6(:,:,6).*a(2,3)); \        %p==2 tiles
    (foo6(:,:,4).*a(3,1) + foo6(:,:,5).*a(3,2) + foo6(:,:,6).*a(3,3)); \        %p==3 tiles
    (foo6(:,:,7).*a(1,1) + foo6(:,:,8).*a(1,2) + foo6(:,:,9).*a(1,3)); \        %p==1 tiles
    (foo6(:,:,7).*a(2,1) + foo6(:,:,8).*a(2,2) + foo6(:,:,9).*a(2,3)); \        %p==2 tiles
    (foo6(:,:,7).*a(3,1) + foo6(:,:,8).*a(3,2) + foo6(:,:,9).*a(3,3))]; \        %p==3 tiles

%reshape H_ijpspq to suit contraction w/ s
%put s tiles in 3rd d; s has period 9 w/ row index

foo7(:,:,1) = H(:,1:3);            %s==1 tiles
foo7(:,:,2) = H(:,4:6);            %s==2 tiles
foo7(:,:,3) = H(:,7:9);            %s==3 tiles
foo7(:,:,4) = H(:,10:12);        %s==1 tiles
foo7(:,:,5) = H(:,13:15);        %s==2 tiles
foo7(:,:,6) = H(:,16:18);        %s==3 tiles
foo7(:,:,7) = H(:,19:21);        %s==1 tiles
foo7(:,:,8) = H(:,22:24);        %s==2 tiles
foo7(:,:,9) = H(:,25:27);        %s==3 tiles

%CONTRACT s; U_sq * H_ijpspq = I_ijpqpq

I = [(foo7(:,:,1).*a(1,1) + foo7(:,:,2).*a(2,1) + foo7(:,:,3).*a(3,1)), \        %q==1 tiles
    (foo7(:,:,1).*a(1,2) + foo7(:,:,2).*a(2,2) + foo7(:,:,3).*a(3,2)), \        %q==2 tiles
    (foo7(:,:,1).*a(1,3) + foo7(:,:,2).*a(2,3) + foo7(:,:,3).*a(3,3)), \        %q==3 tiles
    (foo7(:,:,4).*a(1,1) + foo7(:,:,5).*a(2,1) + foo7(:,:,6).*a(3,1)), \        %q==1 tiles
    (foo7(:,:,4).*a(1,2) + foo7(:,:,5).*a(2,2) + foo7(:,:,6).*a(3,2)), \        %q==2 tiles
    (foo7(:,:,4).*a(1,3) + foo7(:,:,5).*a(2,3) + foo7(:,:,6).*a(3,3)), \        %q==3 tiles
    (foo7(:,:,7).*a(1,1) + foo7(:,:,8).*a(2,1) + foo7(:,:,9).*a(3,1)), \        %q==1 tiles
    (foo7(:,:,7).*a(1,2) + foo7(:,:,8).*a(2,2) + foo7(:,:,9).*a(3,2)), \        %q==2 tiles
    (foo7(:,:,7).*a(1,3) + foo7(:,:,8).*a(2,3) + foo7(:,:,9).*a(3,3))]; \        %q==3 tiles

%CONTRACT p; I_ijpqpq = J_ijqq
%CONTRACT q; J_ijqq = output_ij

foo8 = kron(ones(9,9),temp).*I;
J = [sum(foo8(1:9,:)); sum(foo8(10:18,:)); sum(foo8(19:27,:))];

output = [sum(J(:,1:9),2), sum(J(:,10:18),2), sum(J(:,19:27),2)];

disp(['time vector : ',num2str(toc)]);
disp(['error btwn loops and vector apprch :',num2str(sum(sum(output-testg)))]);

HPC Essentials III

December 27, 2011

Third lecture on Message Passing Interface, procfs, Amdahl’s law, inter-process communication and more


band of skulls

December 15, 2011

sheer genius, I can’t believe I waited this long to pick up the album !?!


Quantum Espresso Quick Start

December 1, 2011

A brief guide for the clusters at PSU, just enough to make you dangerous :)


HPC Essentials I & II

November 30, 2011

First two lectures from the short course at PSU; UNIX/C overview, performance, POSIX threads, OpenMP, Bash scripting, affinity, architecture


tidy profile data

August 26, 2011

here’s a bash/octave script for tidying up the aforementioned profiling output. The script pulls relevant meta data, collecting time and frequency for same call(s) and producing a neat, sorted *csv which can be used for generating nice graphs, here again an example from the lovely quantum espresso





 #!/bin/bash
 #cleanup and sort profile data
gawk '($1~"meta"){print $0}' | \
sed 's/\ //g' | \
sed 's/meta//g' |\
sed 's/<//g'|\
sed 's/\///g'|\
sed 's/>//g'|\
tee prof_data_raw.txt |\
gawk 'BEGIN{FS=":"}{print NR,",",$1,",",$2$3}' > foo.txt
gawk 'BEGIN{FS=":"}{print $4,",",$2,"#",$3,","}' < prof_data_raw.txt > names.txt 

cat > temp.m << %end 

load foo.txt; 

% collect the profile data for unique calls
% the call column 3 is a unique cat of call line + file number
[dat ind]=sort(foo(:,3));
 out = foo(ind,:);
%determine total unique call keys (col 3)
uniq = unique(out(:,3));
num = length(uniq); 

%output profile data matrix
profl = zeros(num,4); 

%collect call frequency information
for i=1:num
         ind = find(out(:,3)==uniq(i));
         %call frequency
         profl(i,2)=length(ind);
         %total call time
         profl(i,3)=sum(out(ind,2));
         %average call time
         profl(i,4)=profl(i,3)/profl(i,2);
         %line number
         ln = out(ind,1);
         profl(i,1) = ln(1);
end 

%sort by total time
[dat ind]=sort(profl(:,3));
out = profl(ind,:);
out=flipud(out);
fid=fopen('names.txt');
[a b]=fscanf(fid,'%s');
str = strsplit(a,",");
fclose(fid);
fid = fopen('profile_output.csv','w');  
fprintf(fid,"call, hash, total time (ms), average time (ms), frequency\n"); 

for i=1:num
         index = out(i,1);
         fprintf(fid,"%s,%s,%f,%f,%f\n",str{2*index-1},str{2*index},out(i,3),out(i,4),out(i,2));
end      

fclose(fid);
%end 

octave --silent --eval temp 

#rm temp.m prof_data_raw.txt foo.txt names.txt
exit 


warlocks

August 26, 2011

magnificent, how does this have less than 1k views after 2 years??


xml call tree

August 16, 2011

To be sure, there are many great profilers out there; each with different learning curves and costs ranging from free to a kidney. If you face a new distribution or one you would like to develop with, there is no substitute for rolling your own; follow this bash script, rebuild and filter the output and you have an xml call tree with times for calls, and pointers to source code and line number. This is applied to the fortran code(s) in quantum espresso, you can readily adapt to your own needs. Caveat: There are a variety of filter steps in the pipeline; taking into account fortran syntax and different programming styles is difficult :) Once you have your stdout filtered to proper xml, you can view in something like eclipse eg,. attached image. Be aware also that you can and will write very large files; I exclude many i/o and related functions, these you can poke with strace or similar anyway.


 #!/bin/bash

#set source and destination paths
PROFILE_DIR_PATH=~/scratch/espresso-PRACE/PW
PROFILE_BAK_PATH=~/scratch/PW_BAK

#set include/exclude strings; these i/o related function calls won't be timed later
EXCLUDE="!(\$0~\"bcast\")&&!(\$0~\"info\")&&!(\$0~\"clock\")&&!(\$0~\"report\")&&!(\$0~\"mp\")&&!(\$0~\"!\")&&!(\$0~\"error\")&&\
!(\$0~\"read\")&&!(\$0~\"write\")&&!(\$0~\"plot\")&&!(\$0~\"delete\")&&!(\$0~\"scan\")&&!(\$0~\"specific\")&&!(\$0~\"print\")&&!(\$0~\"format\")"
INCLUDE="(\$0~\"bcast\")||(\$0~\"info\")||(\$0~\"clock\")||(\$0~\"report\")||(\$0~\"mp\")||(\$0~\"!\")||(\$0~\"error\")||\
(\$0~\"read\")||(\$0~\"write\")||(\$0~\"plot\")||(\$0~\"delete\")||(\$0~\"scan\")||(\$0~\"specific\")||(\$0~\"print\")||(\$0~\"format\")"

#do once
#mkdir  -p $PROF_BAK_PATH
#cp -iu $PROF_DIR_PATH/* $PROF_BAK_PATH

declare -a file_list

#filenames to array
file_list=$(ls -l ${PROFILE_BAK_PATH} | gawk '/f90/ {print $9}')

cnt=1000;

#parse files & pretty up, inserting write statements and timers
for x in $file_list
do
    let "cnt+=1"
    sed 's/\,\&/\,\ \&/g' $PROFILE_BAK_PATH/$x | \
    sed 's/)/)\ /g' | \
    gawk 'BEGIN{IGNORECASE=1; FS="&"} (NF<=1) || ($0~"#") || !($1~"if") || ($0~"!") || '$INCLUDE' {print $0}; \
    (NF>1) && !($0~"#") && ($1~"if") && !($0~"!") && '$EXCLUDE' {printf("%s ",$1)}' |\
    gawk 'BEGIN{IGNORECASE=1;} /if/ { if (($0~"&") && !($0~"!") &&!($0~"#") \
    && !($NF~"&") && !($0~"error")) {gsub(/&/," ");print} else print $0}; \
    !/if/ {print $0}' |\
    gawk 'BEGIN{IGNORECASE=1; } (($1~"if")||($1~"else")) && (length($0)>100) \
    {{for (i=1;i<=NF;i++) printf("%s",$i);} printf("\n")} !(($1~"if")||($1~"else")) || (length($0)<=100) {print $0}' |\
    gawk 'BEGIN{IGNORECASE=1; FS="&"} (NF==1) || ($0~"#") || !($0~"call") || '$INCLUDE' {print $0}; \
    (NF>1) && !($0~"#") && ($0 ~ "call") && '$EXCLUDE' {printf("%s ",$1)}' |\
    gawk 'BEGIN{IGNORECASE=1; FS="&"} (NF==1) || ($0~"#") || !($0~"call") || '$INCLUDE' {print $0}; \
    (NF>1) && !($0~"#") && ($0 ~ "call") && '$EXCLUDE' {printf("%s ",$1)}' |\
    gawk 'BEGIN{IGNORECASE=1;} !/call/{print $0}; /call/ {for (i=1; i<=NF; i++) printf("%s",$i); printf("\n")}' | \
    sed 's/call/\ call\ /g' | \
    sed 's/CALL/\ call\ /g' |\
    sed 's/(/\ (/g' | \
    gawk 'BEGIN{IGNORECASE=1; ch=1;} !/call/ || '$INCLUDE' {print $0}; \
    /call/ && '$EXCLUDE' {pos=pos+1; print " call date_and_time(values=time_array_0) "; \
    for (i=1; i<=NF; i++) {if ($i=="call") ch=i+1;} ;\
    print " start_time = time_array_0(6)*60 + time_array_0(7) + 0.001*time_array_0(8) "; \
    print " write(*,*)\" <",$ch,">\" "; print $0; \
    print " call date_and_time(values=time_array_1) "; \
    print " finish_time=time_array_1(6)*60 + time_array_1(7) + 0.001*time_array_1(8) "; \
    print " write(*,*)\" <meta> \",1000*(finish_time-start_time),\" : ",NR," : ",'$cnt'," : ",$ch," </meta>\" " ;\
    print " write(*,*)\" <source> ~/espresso-PRACE/PW/'$x' </source> \" " ;  \
    print " write(*,*)\"</",$ch,">\" " ; }' | \
    gawk 'BEGIN{IGNORECASE=1; } {print $0;} /IMPLICIT NONE/{ print "INTEGER :: time_array_0(8), time_array_1(8)"; \
    print "REAL :: start_time, finish_time"; }' |\
    gawk 'BEGIN{FS=","} ($0~"call" && (NF <=5)) || ($0 ~ "!") || !/call/{print $0} \
    $0~"call" && (NF>5) && !($0 ~ "!") {print $1,",",$2,",",$3,",",$4,",&"; for (i=5;i<NF;i++) \
    printf("%s,",$i); printf("%s \n",$NF)}' > $PROFILE_DIR_PATH/$x
done

#copy in hard cases; don't bother
cp $PROFILE_BAK_PATH/input.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/symme.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/g_psi.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/pw2blip.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/stop_run.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/vcsmd.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/dynamics_module.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/paw_init.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/compute_becsum.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/h_epsi_her_*.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/wannier_*.f90 $PROFILE_DIR_PATH
cp $PROFILE_BAK_PATH/wfcinit.f90 $PROFILE_DIR_PATH

exit


vectorized octave

December 28, 2010

octave can be very fast when vectorized, particularly when reading and formatting streams. Here’s an example which takes an array read from stdin and reformats :




	%reapportion  bytes

	[m n]=size(input);

	if (en=='ieee-be')

	%7 4 byte numbers
	outputA = reshape(input(:,1:14),m,2,7);
	outputB = reshape((bitshift(outputA(:,1,:),16) + outputA(:,2,:)),m,7);

	%4 2 byte numbers
	%8 4 byte numbers
	outputA = reshape(input(:,19:34),m,2,8);
	outputC = reshape((bitshift(outputA(:,1,:),16) + outputA(:,2,:)),m,8);

	%2 2 byte numbers
	%4 4 byte numbers
	outputA = reshape(input(:,37:44),m,2,4);
	outputD = reshape((bitshift(outputA(:,1,:),16) + outputA(:,2,:)),m,4);

	else

	%7 4 byte numbers
	outputA = reshape(input(:,1:14),m,2,7);
	outputB = reshape((outputA(:,1,:) + bitshift(outputA(:,2,:),16)),m,7);

	%4 2 byte numbers
	%8 4 byte numbers
	outputA = reshape(input(:,19:34),m,2,8);
	outputC = reshape((outputA(:,1,:) + bitshift(outputA(:,2,:),16)),m,8);

	%2 2 byte numbers
	%4 4 byte numbers
	outputA = reshape(input(:,37:44),m,2,4);
	outputD = reshape((outputA(:,1,:) + bitshift(outputA(:,2,:),16)),m,4);

	end

	%must recast for signs
	output = [int32(outputB) int16(input(:,15:18)) int32(outputC) int16(input(:,35:36)) int32(outputD) int16(input(:,45:n))]; 	


battles

December 28, 2010

great prog rock, sounds very good very late in the evening


Follow

Get every new post delivered to your Inbox.