//  Getinf.java --  Retrieve information from a table on the stateoftheair webpage.
//                   Align this information and save it to a ".csv" (comma-delimited) file
//                   that can be read directly by Microsoft Excel.
//                   The delimiter can be changed to a tab for reading if needed.
//                       
// Arguments:      the argument is a state name;
//                    the default state is pennsylvania if no state is given as an argument.
//
// Output:         output is a file named "statenamealabama.csv" (if the argument given were "alabama")  
//
//
// uses:           htmlparser.jar
//                 This is a java-based parser for HTML -- i.e. HTMLParser.jar  Java Library 
//                 and is found at (obtained from) http://htmlparser.sourceforge.net  (D.Oswald)
// 
//
//  
//  written by bkrespan    
//  9/18/2009 
//
//
//
//
//

// package defaultpkge;

import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.nodes.*;
import org.htmlparser.util.*;
import org.htmlparser.parserapplications.filterbuilder.wrappers.AndFilterWrapper;
import org.htmlparser.beans.*;
import org.htmlparser.tags.*;
import org.htmlparser.*;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;


/**
 * Downloads a state page, extracts every {@code <table class="county-data">}
 * on it and writes the table contents to a delimited text file.
 *
 * <p>The state name is taken from the first command-line argument; when no
 * argument is given, {@code pennsylvania} is used. The output file is named
 * {@code "statename" + state + ".csv"} and is created in the current directory.
 */
public class Getinf {

    /** Base URL of the per-state pages; the state name is appended to it. */
    private static final String BASE_URL = "http://stateoftheair.org/2009/states/";
    /** Prefix of the output file name. */
    private static final String FILE_PREFIX = "statename";
    /** Extension of the output file name. */
    private static final String FILE_EXTENSION = ".csv";
    /** State used when no command-line argument is supplied. */
    private static final String DEFAULT_STATE = "pennsylvania";
    /** Text written after every cell; change to "\t" for tab-delimited output. */
    private static final String DELIMITER = " , ";
    /** A data cell starting with this text ends the current row. */
    private static final String TOTALS_PREFIX = "TOTAL:";

    /** Matches the tables whose rows are exported. */
    private static final NodeFilter TABLE_FILTER =
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "county-data"));
    private static final NodeFilter ROW_FILTER = new TagNameFilter("tr");
    private static final NodeFilter HEADER_CELL_FILTER = new TagNameFilter("th");
    private static final NodeFilter DATA_CELL_FILTER = new TagNameFilter("td");

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} is the state name appended to the URL
     *             and embedded in the output file name
     * @throws ParserException if the parser cannot be created for the URL
     * @throws IOException     if the output file cannot be written
     */
    public static void main(String args[]) throws ParserException, IOException {
        String state = DEFAULT_STATE;
        // The state name is the FIRST argument (index 0). Reading index 1 would
        // silently ignore a single argument and always fall back to the default.
        if (args.length > 0) {
            state = args[0];
            System.out.println("args " + state);
        }
        String url = BASE_URL + state;
        String fname = FILE_PREFIX + state + FILE_EXTENSION;

        Parser parser = new Parser(url);

        // Best effort: if extraction fails the problem is reported and an
        // empty output file is still produced.
        NodeList tables = new NodeList();
        try {
            tables = parser.extractAllNodesThatMatch(TABLE_FILTER);
        } catch (ParserException e) {
            System.out.println("Stacktrace printout next");
            e.printStackTrace();
        } catch (Exception e) {
            System.out.println(e.toString());
        }

        // A growable list, so the number of rows on the page is not limited.
        List<String> rows = new ArrayList<String>();
        for (int t = 0; t < tables.size(); t++) {
            appendTableRows(tables.elementAt(t), rows);
        }

        // write all the rows to the csv file
        WritetoFile(rows.toArray(new String[rows.size()]), fname, rows.size());
    }

    /**
     * Converts one table into text lines, echoes each line to the console and
     * appends it to {@code rows}, followed by a blank separator entry.
     * A table without any row contributes nothing.
     *
     * @param table the table node to convert
     * @param rows  receives the generated lines
     */
    private static void appendTableRows(Node table, List<String> rows) {
        // Rows are searched recursively because they may be nested below the table node.
        NodeList rowNodes = matchingChildren(table, ROW_FILTER, true);
        int rowCount = rowNodes.size();
        if (rowCount == 0) {
            return;
        }

        // The first row holds the headers; its cell count fixes the column count.
        NodeList headerCells = matchingChildren(rowNodes.elementAt(0), HEADER_CELL_FILTER, false);
        int columnCount = headerCells.size();

        for (int r = 0; r < rowCount; r++) {
            String line = (r == 0)
                    ? headerLine(headerCells)
                    : dataLine(rowNodes.elementAt(r), columnCount);
            rows.add(line);
            // print out the data to console also
            System.out.println(line);
        }

        // blank lines between tables
        rows.add("\n\n");
    }

    /**
     * Returns the children of {@code parent} accepted by {@code filter}, or an
     * empty list when the node has no children.
     */
    private static NodeList matchingChildren(Node parent, NodeFilter filter, boolean recursive) {
        NodeList children = parent.getChildren();
        if (children == null) {
            return new NodeList();
        }
        return children.extractAllNodesThatMatch(filter, recursive);
    }

    /**
     * Builds the header line: the cleaned text of every header cell, each
     * followed by the delimiter, terminated by a newline.
     */
    private static String headerLine(NodeList headerCells) {
        StringBuilder line = new StringBuilder();
        for (int c = 0; c < headerCells.size(); c++) {
            String text = headerCells.elementAt(c).toPlainTextString();
            text = text.replace("\n", "");
            text = text.replace("\r", "");
            // drop the tail of an "&amp;" entity
            text = text.replace("amp;", "");
            // drop runs of seven spaces (page indentation)
            text = text.replace("       ", "");
            text = text.replace("\t", " ");
            line.append(text).append(DELIMITER);
        }
        return line.append("\n").toString();
    }

    /**
     * Builds one data line from the {@code td} cells of {@code row}. At most
     * {@code columnCount} cells are used, and never more than the row holds.
     * Cell text is trimmed and stripped of commas; a cell starting with
     * "TOTAL:" ends the line without being written.
     */
    private static String dataLine(Node row, int columnCount) {
        NodeList cells = matchingChildren(row, DATA_CELL_FILTER, true);
        // Rows may have fewer cells than there are headers; do not read past the end.
        int limit = Math.min(columnCount, cells.size());

        StringBuilder line = new StringBuilder();
        for (int c = 0; c < limit; c++) {
            String text = cells.elementAt(c).toPlainTextString().trim();
            // remove the commas so they do not clash with the delimiter
            text = text.replace(",", "");
            if (text.startsWith(TOTALS_PREFIX)) {
                break;
            }
            line.append(text).append(DELIMITER);
        }
        return line.append("\n").toString();
    }

    /**
     * Writes the first {@code numlines} entries of {@code textlines} to the file
     * {@code filname}, replacing any existing content. The file is closed even
     * when a write fails.
     *
     * @param textlines lines to write; each entry carries its own line terminator
     * @param filname   name of the file to create
     * @param numlines  number of leading entries to write
     * @throws IOException if the file cannot be opened or written
     */
    private static void WritetoFile(String textlines[], String filname, int numlines) throws ParserException, IOException {
        FileWriter out = new FileWriter(filname);
        try {
            for (int j = 0; j < numlines; j++) {
                out.write(textlines[j]);
            }
        } finally {
            out.close();
        }
    }
}

