/**
 * 
 */
package kr.ac.kaist.swrc.jhannanum.module.tagger;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.StringTokenizer;

import kr.ac.kaist.swrc.jhannanum.module.Module;

/**
 * @author Sangwon Park (hudoni@world.kaist.ac.kr), CILab, SWRC, Kaist
 *
 */
public class HMMTagger implements Module {
	/** Line-length limit carried over from the C version; not referenced by the code in this class. */
	public static int MAXLINE = 10000;
	/** C-style boolean "true"; only referenced from commented-out code in this class. */
	public static int TRUE = 1;
	/** C-style boolean "false"; not referenced by the code in this class. */
	public static int FALSE = 0;
	/** Smoothing factor used as the fixed word-to-word transition score in update_prob_score(). */
	public static double SF = -4.60517018598809136803598290936873;		/* natural log of 0.01 */
	/* NOTE(review): run() compares input lines against the upper-case literal "EOS", not this field - confirm intent. */
	public static String EOS = "eos";
	/* Blank/boundary tag name; compute_wt() spells the same text as a literal instead of reading this field. */
	public static String BNK = "bnk";
	
	/** Name reported by getName(). */
	final static private String MODULE_NAME = "HMMTagger";

	/**
	 * One candidate analysis of a word; a node of the Viterbi lattice.
	 * Nodes live in the {@code mn} array and refer to each other by index,
	 * with index 0 meaning "none".
	 */
	class MNode {
		/* analysis string exactly as passed to new_mnode() */
		String mout_str;
		/* phrase-level tag of this analysis (result of phrase_tag()) */
		String wp_tag;
		/* score of this analysis on its own (result of compute_wt()) */
		double prob_wt;
		/* best accumulated path score ending in this node; written by update_prob_score() */
		double prob;
		int backptr;		/* for viterbi: index of the best predecessor node, 0 if none */
		/* index of the next candidate node of the same word, 0 at the end of the list */
		int sibling;
	}

	/**
	 * Head record of one word of the current sentence; entries live in the
	 * {@code wp} array.
	 */
	class WPhead {
		/* surface form of the word as passed to new_wp() */
		String word;
		/* index into mn of the first candidate node of this word, 0 if it has none */
		int mnode;
	}

	/* Word heads of the current sentence; slot 0 is left unused because wp_end starts at 1. */
	public WPhead[] wp = null;
	/* Index of the next free slot in wp. */
	public int wp_end = 0;

	/* Candidate-analysis nodes of the current sentence; index 0 serves as the "no node" value. */
	public MNode[] mn = null;
	/* Index of the next free slot in mn. */
	public int mn_end = 0;

	/* Looked up with a "morpheme/tag" key in compute_wt() (lexical score, P(w|t)). */
	public ProbabilityDBM pwt_tf = null;
	/* Looked up with a "tag-tag" key in compute_wt() (tag bigram score). */
	public ProbabilityDBM ptt_tf = null;
	/* Opened in initialize(); only referenced from commented-out code in update_prob_score(). */
	public ProbabilityDBM pph_tf = null;

	public String PWT_TDBM_FILE;	/* path of the table loaded into pwt_tf */
	public String PTT_TDBM_FILE;	/* path of the table loaded into ptt_tf */
	public String PPH_TDBM_FILE;	/* path of the table loaded into pph_tf */

	/* Next free index in org_eojeol; reset to 0 by run() after each sentence. */
	public int org_pos = 0;
	/* Original surface forms of the words of the current sentence, in input order. */
	public String[] org_eojeol;

	/* When true, update_prob_score() prints every scored transition to System.out. */
	public boolean view = false;

	/* Result of the most recent phrase_tag() call. */
	public String wtag = null;

	/* Fallback log-probability used by compute_wt() when a table lookup returns null. */
	final static double PCONSTANT = -20.0;
	/* Interpolation weight; only referenced from a commented-out formula in compute_wt(). */
	final static double LAMBDA = 0.9;

	final static double Lambda1 = LAMBDA;
	final static double Lambda2 = 1.0 - LAMBDA;

	/* Input stream of morphological analyses; may be null until setReader()/initialize() supplies one. */
	private BufferedReader in = null;
	/* Output stream for tagged sentences; may be null until setWriter()/initialize() supplies one. */
	private PrintWriter out = null;

	/**
	 * Allocates the per-sentence work arrays, opens the three probability
	 * tables and attaches the input and output streams.
	 *
	 * @param in         source of analysis lines, or {@code null} to leave the tagger without input
	 * @param out        destination for tagged output, or {@code null} to leave it without output
	 * @param configFile not read by this implementation; the table paths are hard-coded below
	 * @throws Exception whatever the ProbabilityDBM constructors throw
	 */
	public void initialize(Reader in, Writer out, String configFile) throws Exception {
		// Word heads: slot 0 stays unused, so numbering starts at 1.
		wp = new WPhead[5000];
		for (int slot = 0; slot < wp.length; slot++) {
			wp[slot] = new WPhead();
		}
		wp_end = 1;

		// Lattice nodes: index 0 doubles as the "no node" marker, so numbering starts at 1.
		mn = new MNode[10000];
		for (int slot = 0; slot < mn.length; slot++) {
			mn[slot] = new MNode();
		}
		mn_end = 1;

		org_eojeol = new String[5000];

		PWT_TDBM_FILE = "data/stat/PWT.pos";
		PTT_TDBM_FILE = "data/stat/PTT.pos";
		// NOTE(review): differs from the PTT path only in letter case - confirm this is the intended file.
		PPH_TDBM_FILE = "data/stat/Ptt.pos";

		pwt_tf = new ProbabilityDBM(PWT_TDBM_FILE);
		pph_tf = new ProbabilityDBM(PPH_TDBM_FILE);
		ptt_tf = new ProbabilityDBM(PTT_TDBM_FILE);

		this.in = (in == null) ? null : new BufferedReader(in);
		this.out = (out == null) ? null : new PrintWriter(out);
	}

	/**
	 * Replaces the input stream.
	 *
	 * @param in new source of analysis lines; {@code null} detaches the current input
	 */
	public void setReader(Reader in) {
		this.in = (in == null) ? null : new BufferedReader(in);
	}

	/**
	 * Replaces the output stream.
	 *
	 * @param out new destination for tagged output; {@code null} detaches the current output
	 */
	public void setWriter(Writer out) {
		this.out = (out == null) ? null : new PrintWriter(out);
	}

	/**
	 * Consumes the whole input stream and tags it sentence by sentence.
	 *
	 * Line protocol, as handled below:
	 * <ul>
	 * <li>empty lines and {@code BOS} are skipped;</li>
	 * <li>{@code EOS} tags and prints the collected sentence (end_sentence()), then clears the
	 *     per-sentence state;</li>
	 * <li>{@code EOF} is echoed to the output as {@code "EOF"} followed by an empty line;</li>
	 * <li>any other line starts a word: its first character is dropped and the rest is the surface
	 *     form. The lines that follow, up to the next empty line or end of input, are the candidate
	 *     analyses of that word, each again with its first character dropped (presumably a leading
	 *     tab or marker - confirm against the producer of this stream).</li>
	 * </ul>
	 *
	 * @throws Exception on read errors, or on any failure while scoring an analysis
	 */
	public void run() throws Exception {
		String line;

		while ((line = in.readLine()) != null) {
			// Blank separators and the sentence-opening marker carry no data.
			if (line.length() == 0 || line.equals("BOS")) {
				continue;
			}

			if (line.equals("EOS")) {
				end_sentence();

				// Start the next sentence from a clean slate.
				reset();
				org_pos = 0;
				continue;
			}

			if (line.equals("EOF")) {
				out.write("EOF\n\n");
				out.flush();
				continue;
			}

			// A word line: remember the surface form and open a word head for it.
			final String eojeol = line.substring(1);
			org_eojeol[org_pos++] = eojeol;
			final int head = new_wp(eojeol);

			// Chain every candidate analysis of this word into a sibling list.
			boolean chainStarted = false;
			int tail = 0;
			String candidate;
			while ((candidate = in.readLine()) != null && candidate.length() != 0) {
				final String analysis = candidate.substring(1);
				final String phraseTag = phrase_tag(analysis);
				final double score = compute_wt(analysis);
				final int node = new_mnode(analysis, phraseTag, score);

				if (chainStarted) {
					mn[tail].sibling = node;
				} else {
					wp[head].mnode = node;
					chainStarted = true;
				}
				tail = node;
			}
		}
	}

	/**
	 * Does nothing: this method neither closes the streams nor releases the
	 * probability tables.
	 */
	public void shutdown() {

	}
	
	/**
	 * @return the fixed module name, {@code "HMMTagger"}
	 */
	public String getName() {
		return MODULE_NAME;
	}

	/**
	 * Discards the words and lattice nodes of the sentence just processed by
	 * rewinding both allocation counters to their first usable slot (1).
	 * The array contents are left in place and overwritten on reuse.
	 */
	private void reset() {
		mn_end = 1;
		wp_end = 1;
	}

	/**
	 * Claims the next free word head and initialises it with no candidate nodes.
	 *
	 * @param str surface form of the word
	 * @return index of the claimed slot in {@code wp}
	 */
	public int new_wp(String str) {
		final int slot = wp_end;
		final WPhead head = wp[slot];
		head.word = str;
		head.mnode = 0;
		// Advance the counter only once the slot has been filled in.
		wp_end = slot + 1;
		return slot;
	}

	/**
	 * Claims the next free lattice node and fills it in, with no predecessor
	 * and no sibling yet. The accumulated score {@code prob} is left untouched.
	 *
	 * @param str    analysis string to store
	 * @param wp_tag phrase-level tag of the analysis
	 * @param prob   stand-alone score of the analysis, stored as {@code prob_wt}
	 * @return index of the claimed slot in {@code mn}
	 */
	public int new_mnode(String str, String wp_tag, double prob)
	{
		final int slot = mn_end;
		final MNode node = mn[slot];
		node.mout_str = str;
		node.wp_tag = wp_tag;
		node.prob_wt = prob;
		node.backptr = 0;
		node.sibling = 0;
		// Advance the counter only once the slot has been filled in.
		mn_end = slot + 1;
		return slot;
	}

	/**
	 * Viterbi relaxation step: scores the path that reaches node {@code to}
	 * through node {@code from} and keeps it if it is the best one seen so far.
	 *
	 * The transition score between two words is currently the constant
	 * smoothing factor {@link #SF}; the phrase-tag transition lookup of the
	 * original C implementation (against {@code pph_tf}) is not ported.
	 *
	 * @param from index in {@code mn} of a candidate node of the preceding word
	 * @param to   index in {@code mn} of a candidate node of the following word
	 */
	public void update_prob_score(int from, int to)
	{
		/* TODO: replace the constant by a real phrase-tag transition score */
		final double PTT = SF;

		// A node without a predecessor starts a path: its path score is its own score.
		if (mn[from].backptr == 0) {
			mn[from].prob = mn[from].prob_wt;
		}

		// Log-domain product: path so far * transition * the target node's own score.
		final double P = mn[from].prob + PTT + mn[to].prob_wt;

		// Optional trace of every scored transition.
		if (view) {
			System.out.format("%s(%d:%s) : %f -> %f -> %s(%d:%s) : %f\n", mn[from].mout_str, 
					from, mn[from].wp_tag, mn[from].prob, PTT, 
					mn[to].mout_str, to, mn[to].wp_tag, mn[to].prob_wt );
		}

		// Keep this path if it is the first one to reach 'to' or beats the current best.
		if (mn[to].backptr == 0 || P > mn[to].prob) {
			mn[to].backptr = from;
			mn[to].prob = P; 
		}
	}

	/**
	 * Derives the phrase-level tag of one analysis.
	 *
	 * Only the first space/tab-delimited token of {@code str} is examined. It
	 * is read as morphemes joined by {@code '+'}, each followed by
	 * {@code '/'} and its tag. A backslash hides the character after it from
	 * this scan. The tags (at most 10 characters of each) are joined with
	 * single spaces and handed to {@code PhraseTag.phtag}. The result is also
	 * stored in {@link #wtag}.
	 *
	 * @param str analysis string, e.g. of the form {@code morph/tag+morph/tag}
	 * @return the phrase tag computed by {@code PhraseTag.phtag}
	 * @throws java.util.NoSuchElementException if {@code str} contains no token
	 */
	public String phrase_tag(String str) {
		StringTokenizer st = new StringTokenizer(str, " \t");

		if (!st.hasMoreTokens()) {
			// Fail with a message instead of letting nextToken() fail without one;
			// the exception type is the one nextToken() itself would raise.
			System.err.println("HMMTagger: phrase_tag: wrong parameter");
			throw new java.util.NoSuchElementException("HMMTagger: phrase_tag: wrong parameter");
		}

		final String words = st.nextToken();
		final StringBuilder tags = new StringBuilder();

		// 0 while outside a tag; otherwise 1 + number of tag characters copied so far.
		int flag = 0;

		for (int p = 0; p < words.length(); p++) {
			final char c = words.charAt(p);
			if (c == '\\') {
				// Escaped character: skip it together with the backslash.
				p++;
			} else if (c == '+') {
				// Morpheme boundary: close the current tag.
				flag = 0;
				tags.append(' ');
			} else if (c == '/') {
				// The tag of the current morpheme starts after this slash.
				flag = 1;
			} else if (flag != 0) {
				tags.append(c);
				flag++;
				// Stop copying after 10 tag characters.
				if (flag > 10) {
					flag = 0;
				}
			}
		}

		wtag = PhraseTag.phtag(tags.toString());

		return wtag;
	}

	/**
	 * Closes the current sentence: runs Viterbi over the collected lattice and
	 * prints, for every word in input order, its original surface form
	 * followed by the analysis chosen for it.
	 *
	 * Output format per word: surface form, newline, analysis, empty line.
	 * One additional newline is written after the last word.
	 */
	public void end_sentence()
	{
		// Append an artificial sentence-final word with a single "SF" node so
		// that every path through the lattice ends in the same place.
		final int terminal = new_wp(" ");
		wp[terminal].mnode = new_mnode(" ", "SF", 0);

		// Forward pass: relax every edge between candidates of adjacent words.
		int w = 1;
		while (w < wp_end - 1) {
			for (int from = wp[w].mnode; from != 0; from = mn[from].sibling) {
				for (int to = wp[w + 1].mnode; to != 0; to = mn[to].sibling) {
					update_prob_score(from, to);
				}
			}
			w++;
		}
		// w now indexes the word at which the backtrace starts (the sentence-final word).

		// Backward pass: follow the back pointers; the path comes out last word first,
		// with the artificial final node in slot 0.
		final String[] path = new String[5000];
		int depth = 0;
		int node = wp[w].mnode;
		while (node != 0) {
			path[depth] = mn[node].mout_str;
			depth++;
			node = mn[node].backptr;
		}

		// Emit in input order, leaving out the artificial final node.
		final int words = depth - 1;
		for (int n = 0; n < words; n++) {
			out.format("%s\n%s\n\n", org_eojeol[n], path[words - n]);
			out.flush();
		}
		out.write('\n');
		out.flush();
	}
	//
	//	get_dict()
	//	{
	//		FILE *cfg;
	//		char _buf[1024],list[1024],fn[1024];
	//		int a=0;
	//
	//		cfg=fopen("tagger.cfg","r");
	//
	//		while(fgets(_buf,1024,cfg)!=NULL)
	//		{
	//			a++;
	//			sscanf(_buf,"%s %s",list,fn);
	//			if (*list=='#')	// remark
	//				continue;
	//			else
	//			{
	//				if (strcmp(list,"PWT")==0)
	//					strcpy(PWT_TDBM_FILE,fn);
	//				else if (strcmp(list,"PTT")==0)
	//					strcpy(PTT_TDBM_FILE,fn);
	//				else if (strcmp(list,"PPH")==0)
	//					strcpy(PPH_TDBM_FILE,fn);
	//				else fprintf(stderr,"error in line %d of tagger.cfg\n",a);
	//			}
	//		}
	//	}
	//				

	/**
	 * Scores one candidate analysis on its own, in the log domain.
	 *
	 * {@code str} is read as morphemes separated by {@code '+'} (a
	 * {@code '+'} directly preceded by a backslash does not separate), each of
	 * the form {@code morph/tag}. The score is the sum over all morphemes of
	 * <pre>
	 *   P(w_i | t_i)        looked up in pwt_tf under "morph/tag"
	 * + P(t_i | t_(i-1))    looked up in ptt_tf under "prevtag-tag"
	 * </pre>
	 * with {@code "bnk"} standing in for the tag before the first morpheme,
	 * plus a closing transition from the last tag to {@code "bnk"}. Any key
	 * missing from a table contributes {@link #PCONSTANT}.
	 *
	 * @param str analysis string of the form {@code morph/tag+morph/tag...}
	 * @return the accumulated log score
	 * @throws StringIndexOutOfBoundsException if {@code str} is empty or a
	 *         morpheme contains no {@code '/'}
	 */
	public double compute_wt(String str)
	{
		double current = 0.0;
		String oldtag = "bnk";	/* tag context before the first morpheme */
		int start = 0;

		do {
			final int end = morphemeEnd(str, start);
			final String morphtag = str.substring(start, end);	/* e.g. "word/ncn" */

			final int slash = morphtag.lastIndexOf('/');
			if (slash < 0) {
				throw new StringIndexOutOfBoundsException(
						"HMMTagger: compute_wt: no '/' in morpheme: " + morphtag);
			}
			final String tag = morphtag.substring(slash + 1);	/* e.g. "ncn" */

			/* P(t_i | t_i-1) */
			final double tbigram = lookupLogProb(ptt_tf, oldtag + "-" + tag);
			/* P(w_i | t_i) */
			final double lexicon = lookupLogProb(pwt_tf, morphtag);

			// The unigram P(t_i) was only needed by the interpolated and
			// normalised variants of this formula, which are not in use.
			current += lexicon + tbigram;

			oldtag = tag;
			start = end + 1;	/* step over the '+' */
		} while (start < str.length());

		/* closing transition P(bnk | t_last); P(w | bnk) = 1 adds nothing in the log domain */
		current += lookupLogProb(ptt_tf, oldtag + "-bnk");

		return current;
	}

	/** Returns the index just past the morpheme starting at {@code start}: the next unescaped '+' or the end of {@code str}. */
	private static int morphemeEnd(String str, int start) {
		int end = start + 1;
		while (end < str.length() && !(str.charAt(end - 1) != '\\' && str.charAt(end) == '+')) {
			end++;
		}
		return end;
	}

	/** Returns the first value stored under {@code key} in {@code table}, or PCONSTANT when the key is absent. */
	private static double lookupLogProb(ProbabilityDBM table, String key) {
		final double[] prob = table.get(key);
		return (prob != null) ? prob[0] : PCONSTANT;
	}
}

