In the previous post we set up a Hadoop cluster environment (two nodes).
Hadoop is a cluster system by nature: even a single node is still a (single-node) cluster. Under the hood Hadoop uses HDFS as its default file system, MapReduce is the execution model (framework) that runs on top of HDFS, and YARN is the resource-scheduling framework that was split out of the MapReduce framework starting with Hadoop 2.x. We will cover YARN in detail in a later post.
Ahem, back to the main topic.
HDFS is Hadoop's built-in (default) distributed file system. It can store huge volumes of data and provides load balancing and replica-based fault tolerance (which it does very well). HDFS exposes a web (RESTful) API, a Java API and a command-line API for user operations such as uploading and downloading files.
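The command-line and Java APIs are covered in detail below. For completeness, here is a minimal sketch of calling the web (RESTful) interface from Java. It assumes WebHDFS is enabled on the cluster and that the NameNode's HTTP port is the Hadoop 2.x default of 50070; the host name hadoopNode01 and the user hadoop come from the cluster built earlier, and the class name is just for illustration.
WebHdfsListDemo.java
package com.tingcream.hadoopStudy.hdfs;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class WebHdfsListDemo {
    public static void main(String[] args) throws Exception {
        // WebHDFS LISTSTATUS call: http://<namenode>:50070/webhdfs/v1/<path>?op=LISTSTATUS
        // Here we list the root directory "/" as the user "hadoop".
        URL url = new URL("http://hadoopNode01:50070/webhdfs/v1/?op=LISTSTATUS&user.name=hadoop");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("GET");
        BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            System.out.println(line); // the directory listing is returned as JSON
        }
        reader.close();
        conn.disconnect();
    }
}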
The HDFS command-line API (these are really just Hadoop's Linux shell commands):
hadoop fs   # pressing Enter prints the command help shown below
[hadoop@hadoopNode01 ~]$ hadoop fs
Usage: hadoop fs [generic options]
	[-appendToFile <localsrc> ... <dst>]
	[-cat [-ignoreCrc] <src> ...]
	[-checksum <src> ...]
	[-chgrp [-R] GROUP PATH...]
	[-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
	[-chown [-R] [OWNER][:[GROUP]] PATH...]
	[-copyFromLocal [-f] [-p] [-l] <localsrc> ... <dst>]
	[-copyToLocal [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
	[-count [-q] [-h] <path> ...]
	[-cp [-f] [-p | -p[topax]] <src> ... <dst>]
	[-createSnapshot <snapshotDir> [<snapshotName>]]
	[-deleteSnapshot <snapshotDir> <snapshotName>]
	[-df [-h] [<path> ...]]
	[-du [-s] [-h] <path> ...]
	[-expunge]
	[-find <path> ... <expression> ...]
	[-get [-p] [-ignoreCrc] [-crc] <src> ... <localdst>]
	[-getfacl [-R] <path>]
	[-getfattr [-R] {-n name | -d} [-e en] <path>]
	[-getmerge [-nl] <src> <localdst>]
	[-help [cmd ...]]
	[-ls [-d] [-h] [-R] [<path> ...]]
	[-mkdir [-p] <path> ...]
	[-moveFromLocal <localsrc> ... <dst>]
	[-moveToLocal <src> <localdst>]
	[-mv <src> ... <dst>]
	[-put [-f] [-p] [-l] <localsrc> ... <dst>]
	[-renameSnapshot <snapshotDir> <oldName> <newName>]
	[-rm [-f] [-r|-R] [-skipTrash] <src> ...]
	[-rmdir [--ignore-fail-on-non-empty] <dir> ...]
	[-setfacl [-R] [{-b|-k} {-m|-x <acl_spec>} <path>]|[--set <acl_spec> <path>]]
	[-setfattr {-n name [-v value] | -x name} <path>]
	[-setrep [-R] [-w] <rep> <path> ...]
	[-stat [format] <path> ...]
	[-tail [-f] <file>]
	[-test -[defsz] <path>]
	[-text [-ignoreCrc] <src> ...]
	[-touchz <path> ...]
	[-truncate [-w] <length> <path> ...]
	[-usage [cmd ...]]

Generic options supported are
-conf <configuration file>                      specify an application configuration file
-D <property=value>                             use value for given property
-fs <local|namenode:port>                       specify a namenode
-jt <local|resourcemanager:port>                specify a ResourceManager
-files <comma separated list of files>          specify comma separated files to be copied to the map reduce cluster
-libjars <comma separated list of jars>         specify comma separated jar files to include in the classpath.
-archives <comma separated list of archives>    specify comma separated archives to be unarchived on the compute machines.

The general command line syntax is
The commands I use most often are:
hadoop fs -put local remote        # upload a local file to HDFS
hadoop fs -get remote local        # download a file from HDFS to the local machine
hadoop fs -ls path                 # list the files (and directories) under a directory on HDFS
hadoop fs -cat remote              # print the contents of a file on HDFS
hadoop fs -mkdir [-p] path         # create a directory on HDFS (-p creates missing parent directories recursively)
hadoop fs -rm -f -r path           # delete a directory on HDFS
hadoop fs -rm -f remote            # delete a file on HDFS
hadoop fs -mv src dest             # move a file on HDFS (both src and dest are on HDFS)
hadoop fs -cp src dest             # copy a file on HDFS (both src and dest are on HDFS)
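A typical session might look like the following (the file and directory names are made-up examples, not files created elsewhere in this post):
hadoop fs -mkdir -p /data/input                  # create /data/input, including the missing parent /data
hadoop fs -put ./words.txt /data/input           # upload a local file
hadoop fs -ls /data/input                        # check that it arrived
hadoop fs -cat /data/input/words.txt             # print its contents
hadoop fs -get /data/input/words.txt ./back.txt  # download it back to the local machine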
Sample code for the HDFS Java client API:
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.tingcream</groupId>
  <artifactId>hadoopStudy</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>
  <name>hadoopStudy</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <!-- hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.6</version>
      <scope>provided</scope>
    </dependency>
    <!-- hadoop-hdfs -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.6</version>
    </dependency>
    <!-- hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.6</version>
    </dependency>
    <!-- hadoop-mapreduce-client-jobclient -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.7.6</version>
      <scope>provided</scope>
    </dependency>
    <!-- hadoop-mapreduce-client-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.6</version>
    </dependency>
  </dependencies>
</project>
HdfsDemo.java
package com.tingcream.hadoopStudy.hdfs;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;

public class HdfsDemo {

    /*
     * FileSystem is an abstract class. Its concrete subclasses include DistributedFileSystem (HDFS),
     * FTPFileSystem (FTP) and RawLocalFileSystem (the local file system, e.g. ext3/ext4 on Linux, NTFS on Windows).
     */
    private FileSystem fs = null;

    @Before
    public void init() throws Exception {
        // Reads the xxx-site.xml files on the classpath (core-site.xml, hdfs-site.xml, etc.),
        // parses them and stores the settings in the conf object.
        Configuration conf = new Configuration();
        /*
         * Note: on your local (Windows) machine you must add host entries so that
         * hadoopNode01 and hadoopNode02 resolve to the right IPs:
         *   192.168.9.11 hadoopNode01
         *   192.168.9.12 hadoopNode02
         */
        // Configuration values can also be set in code; they override the values read from the config files.
        conf.set("fs.defaultFS", "hdfs://hadoopNode01:9000/");
        // Obtain a client instance for the concrete file system: URI, configuration, user.
        fs = FileSystem.get(new URI("hdfs://hadoopNode01:9000/"), conf, "hadoop");
    }

    // Upload a file
    @Test
    public void test1() {
        try {
            fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("/"));
            // fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("hdfs://hadoopNode01:9000/"));
            // fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("/4-2.png"));  // upload and rename
            // fs.copyFromLocalFile(new Path("d:/myImg/4.png"), new Path("hdfs://hadoopNode01:9000/4-3.png"));  // upload and rename
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Download a file
    @Test
    public void test2() {
        try {
            // ok
            // fs.copyToLocalFile(false, new Path("/4.png"), new Path("e:/4.png"), true);
            fs.copyToLocalFile(false, new Path("hdfs://hadoopNode01:9000/4.png"), new Path("e:/4.png"), true);
            System.out.println("ok");
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }

    // Create a directory; multiple levels are created recursively
    @Test
    public void test3() {
        try {
            fs.mkdirs(new Path("/aaa/bbb/ccc"));
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Delete a directory; passing true deletes it recursively
    @Test
    public void test4() {
        try {
            // The second argument (true) means delete recursively (sub-directories and files).
            boolean b = fs.delete(new Path("/aaa/bbb"), true);
            System.out.println("deleted: " + b);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // List all files (and directories) directly under a directory; does not descend into sub-directories
    @Test
    public void test5() {
        try {
            System.out.println("---------------------------------");
            FileStatus[] listStatus = fs.listStatus(new Path("/aaa/bbb"));
            for (FileStatus status : listStatus) {
                String name = status.getPath().getName();
                System.out.println(name + (status.isDirectory() ? " is a directory" : " is a file"));
            }
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
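The CLI section above also shows hadoop fs -cat, but HdfsDemo has no equivalent. The extra test method below is a minimal sketch of reading a file's contents through the same fs object; it can be added inside the HdfsDemo class. The path /test.txt is an assumption for illustration, not a file created anywhere in this post.
    // Print the contents of a file on HDFS, similar to `hadoop fs -cat`
    @Test
    public void test6() {
        try (org.apache.hadoop.fs.FSDataInputStream in = fs.open(new Path("/test.txt"))) {
            // Stream the file to stdout; the last argument is false so System.out is not closed afterwards
            org.apache.hadoop.io.IOUtils.copyBytes(in, System.out, 4096, false);
            System.out.println("ok");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }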
log4j.properties
log4j.rootLogger=DEBUG,stdout
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.Threshold = DEBUG
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d [%t] %-5p [%c] - %m%n
Note: if running the Java code fails with an error like
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.util.NativeCrc32.nativeCompute
....
you need to install the Windows-native binaries (hadoop.dll and winutils.exe), compiled for your local Windows environment, on your machine.
hadoop.dll and winutils.exe (win64 builds for hadoop-2.7.6) can be downloaded from Baidu net disk:
Link: https://pan.baidu.com/s/1prVE5qPWNi5jwd0hMShX2g   Password: mwst
After downloading, copy hadoop.dll and winutils.exe into the c:\windows\system32\ directory and restart your machine.
Then run the Java code above again; everything should now work.
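If you would rather not copy files into system32, another commonly used approach is to point the JVM at a local Hadoop directory instead. This is only a sketch: D:/hadoop-2.7.6 is an assumed local path whose bin subdirectory contains winutils.exe and hadoop.dll.
// Run this before the Configuration/FileSystem is created, e.g. at the top of init():
// winutils.exe is located via the hadoop.home.dir system property (or the HADOOP_HOME environment variable).
System.setProperty("hadoop.home.dir", "D:/hadoop-2.7.6");  // assumed local directory containing bin/winutils.exe and bin/hadoop.dll
// hadoop.dll must additionally be on the JVM's native library path, e.g. by starting the JVM
// with -Djava.library.path=D:/hadoop-2.7.6/bin or by adding that directory to the Windows PATH.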